{ "best_metric": 0.12010584022747946, "best_model_checkpoint": "/workspace/disk2/krishna/checkpoints/checkpoint-940", "epoch": 0.097, "eval_steps": 10, "global_step": 970, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 0.0356324203312397, "learning_rate": 1e-05, "loss": 0.1207, "step": 10 }, { "epoch": 0.001, "eval_cos_sim": 0.8792359232902527, "eval_loss": 0.12173220255124045, "eval_runtime": 171.7728, "eval_samples_per_second": 23.287, "eval_steps_per_second": 0.367, "step": 10 }, { "epoch": 0.002, "grad_norm": 0.030419372022151947, "learning_rate": 2e-05, "loss": 0.1213, "step": 20 }, { "epoch": 0.002, "eval_cos_sim": 0.8789854049682617, "eval_loss": 0.12198310520398092, "eval_runtime": 159.1521, "eval_samples_per_second": 25.133, "eval_steps_per_second": 0.396, "step": 20 }, { "epoch": 0.003, "grad_norm": 0.033041320741176605, "learning_rate": 3e-05, "loss": 0.1204, "step": 30 }, { "epoch": 0.003, "eval_cos_sim": 0.8790653347969055, "eval_loss": 0.12189955379712057, "eval_runtime": 161.0741, "eval_samples_per_second": 24.833, "eval_steps_per_second": 0.391, "step": 30 }, { "epoch": 0.004, "grad_norm": 0.04209210351109505, "learning_rate": 4e-05, "loss": 0.1213, "step": 40 }, { "epoch": 0.004, "eval_cos_sim": 0.8792155385017395, "eval_loss": 0.12175503399121237, "eval_runtime": 159.4228, "eval_samples_per_second": 25.091, "eval_steps_per_second": 0.395, "step": 40 }, { "epoch": 0.005, "grad_norm": 0.03182140365242958, "learning_rate": 5e-05, "loss": 0.1203, "step": 50 }, { "epoch": 0.005, "eval_cos_sim": 0.8791962265968323, "eval_loss": 0.12176946785199118, "eval_runtime": 160.3207, "eval_samples_per_second": 24.95, "eval_steps_per_second": 0.393, "step": 50 }, { "epoch": 0.006, "grad_norm": 0.05823719501495361, "learning_rate": 2.4802665827257164e-05, "loss": 0.1213, "step": 60 }, { "epoch": 0.006, "eval_cos_sim": 0.8791635036468506, "eval_loss": 0.12179789688336325, "eval_runtime": 164.7575, "eval_samples_per_second": 24.278, "eval_steps_per_second": 0.382, "step": 60 }, { "epoch": 0.007, "grad_norm": 0.02305755950510502, "learning_rate": 4.999688473794144e-05, "loss": 0.1211, "step": 70 }, { "epoch": 0.007, "eval_cos_sim": 0.8792662024497986, "eval_loss": 0.12169700815426779, "eval_runtime": 159.8158, "eval_samples_per_second": 25.029, "eval_steps_per_second": 0.394, "step": 70 }, { "epoch": 0.008, "grad_norm": 0.03906348720192909, "learning_rate": 2.4408046661584414e-05, "loss": 0.1201, "step": 80 }, { "epoch": 0.008, "eval_cos_sim": 0.879157543182373, "eval_loss": 0.12181253530728293, "eval_runtime": 162.3324, "eval_samples_per_second": 24.641, "eval_steps_per_second": 0.388, "step": 80 }, { "epoch": 0.009, "grad_norm": 0.0275803804397583, "learning_rate": 4.998753972815434e-05, "loss": 0.1208, "step": 90 }, { "epoch": 0.009, "eval_cos_sim": 0.8792516589164734, "eval_loss": 0.121718086264009, "eval_runtime": 158.5035, "eval_samples_per_second": 25.236, "eval_steps_per_second": 0.397, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.03248042240738869, "learning_rate": 2.4013575023093667e-05, "loss": 0.1224, "step": 100 }, { "epoch": 0.01, "eval_cos_sim": 0.879462718963623, "eval_loss": 0.1215014844154067, "eval_runtime": 159.5892, "eval_samples_per_second": 25.064, "eval_steps_per_second": 0.395, "step": 100 }, { "epoch": 0.011, "grad_norm": 0.03436814621090889, "learning_rate": 4.9971967299611097e-05, "loss": 0.1205, "step": 110 }, { "epoch": 0.011, "eval_cos_sim": 0.8795427680015564, "eval_loss": 0.1214234667037673, "eval_runtime": 168.2892, "eval_samples_per_second": 23.769, "eval_steps_per_second": 0.374, "step": 110 }, { "epoch": 0.012, "grad_norm": 0.03663235530257225, "learning_rate": 2.3619349222387287e-05, "loss": 0.1209, "step": 120 }, { "epoch": 0.012, "eval_cos_sim": 0.8793898224830627, "eval_loss": 0.1215821056579299, "eval_runtime": 170.7269, "eval_samples_per_second": 23.429, "eval_steps_per_second": 0.369, "step": 120 }, { "epoch": 0.013, "grad_norm": 0.03549114614725113, "learning_rate": 4.9950171333287335e-05, "loss": 0.1218, "step": 130 }, { "epoch": 0.013, "eval_cos_sim": 0.8795300722122192, "eval_loss": 0.12144066002118063, "eval_runtime": 162.7257, "eval_samples_per_second": 24.581, "eval_steps_per_second": 0.387, "step": 130 }, { "epoch": 0.014, "grad_norm": 0.03164505586028099, "learning_rate": 2.3225467508799633e-05, "loss": 0.1208, "step": 140 }, { "epoch": 0.014, "eval_cos_sim": 0.8797659873962402, "eval_loss": 0.12119961311566306, "eval_runtime": 163.5115, "eval_samples_per_second": 24.463, "eval_steps_per_second": 0.385, "step": 140 }, { "epoch": 0.015, "grad_norm": 0.031108180060982704, "learning_rate": 4.992215726119483e-05, "loss": 0.1213, "step": 150 }, { "epoch": 0.015, "eval_cos_sim": 0.8797353506088257, "eval_loss": 0.1212306742881484, "eval_runtime": 165.866, "eval_samples_per_second": 24.116, "eval_steps_per_second": 0.38, "step": 150 }, { "epoch": 0.016, "grad_norm": 0.030103642493486404, "learning_rate": 2.2832028045911203e-05, "loss": 0.1209, "step": 160 }, { "epoch": 0.016, "eval_cos_sim": 0.8793777823448181, "eval_loss": 0.12159336998211813, "eval_runtime": 171.7298, "eval_samples_per_second": 23.292, "eval_steps_per_second": 0.367, "step": 160 }, { "epoch": 0.017, "grad_norm": 0.05055614188313484, "learning_rate": 4.9887932065027656e-05, "loss": 0.1204, "step": 170 }, { "epoch": 0.017, "eval_cos_sim": 0.8795183300971985, "eval_loss": 0.1214520344947524, "eval_runtime": 162.2461, "eval_samples_per_second": 24.654, "eval_steps_per_second": 0.388, "step": 170 }, { "epoch": 0.018, "grad_norm": 0.03837039694190025, "learning_rate": 2.2439128887084646e-05, "loss": 0.1202, "step": 180 }, { "epoch": 0.018, "eval_cos_sim": 0.8797397017478943, "eval_loss": 0.12122445728527975, "eval_runtime": 161.451, "eval_samples_per_second": 24.775, "eval_steps_per_second": 0.39, "step": 180 }, { "epoch": 0.019, "grad_norm": 0.03563898801803589, "learning_rate": 4.98475042744222e-05, "loss": 0.1221, "step": 190 }, { "epoch": 0.019, "eval_cos_sim": 0.8797785639762878, "eval_loss": 0.12118578769909812, "eval_runtime": 157.9064, "eval_samples_per_second": 25.331, "eval_steps_per_second": 0.399, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.0392858162522316, "learning_rate": 2.204686795102736e-05, "loss": 0.1204, "step": 200 }, { "epoch": 0.02, "eval_cos_sim": 0.8796395063400269, "eval_loss": 0.12133027555691672, "eval_runtime": 163.7884, "eval_samples_per_second": 24.422, "eval_steps_per_second": 0.385, "step": 200 }, { "epoch": 0.021, "grad_norm": 0.04556349664926529, "learning_rate": 4.980088396483144e-05, "loss": 0.1205, "step": 210 }, { "epoch": 0.021, "eval_cos_sim": 0.8796445727348328, "eval_loss": 0.12132433941113424, "eval_runtime": 164.6589, "eval_samples_per_second": 24.293, "eval_steps_per_second": 0.383, "step": 210 }, { "epoch": 0.022, "grad_norm": 0.030130930244922638, "learning_rate": 2.1655342997387947e-05, "loss": 0.1201, "step": 220 }, { "epoch": 0.022, "eval_cos_sim": 0.8796879649162292, "eval_loss": 0.12127337600933981, "eval_runtime": 163.052, "eval_samples_per_second": 24.532, "eval_steps_per_second": 0.386, "step": 220 }, { "epoch": 0.023, "grad_norm": 0.027453621849417686, "learning_rate": 4.9748082755013934e-05, "loss": 0.1205, "step": 230 }, { "epoch": 0.023, "eval_cos_sim": 0.8797481060028076, "eval_loss": 0.12121248771893454, "eval_runtime": 159.6988, "eval_samples_per_second": 25.047, "eval_steps_per_second": 0.394, "step": 230 }, { "epoch": 0.024, "grad_norm": 0.029768602922558784, "learning_rate": 2.126465160239341e-05, "loss": 0.1206, "step": 240 }, { "epoch": 0.024, "eval_cos_sim": 0.8797679543495178, "eval_loss": 0.12119435026394797, "eval_runtime": 170.5172, "eval_samples_per_second": 23.458, "eval_steps_per_second": 0.369, "step": 240 }, { "epoch": 0.025, "grad_norm": 0.025975426658988, "learning_rate": 4.968911380413809e-05, "loss": 0.1206, "step": 250 }, { "epoch": 0.025, "eval_cos_sim": 0.8798050284385681, "eval_loss": 0.12115855839001609, "eval_runtime": 162.5635, "eval_samples_per_second": 24.606, "eval_steps_per_second": 0.388, "step": 250 }, { "epoch": 0.026, "grad_norm": 0.032136961817741394, "learning_rate": 2.0874891134530094e-05, "loss": 0.1207, "step": 260 }, { "epoch": 0.026, "eval_cos_sim": 0.8799233436584473, "eval_loss": 0.12104250910031271, "eval_runtime": 171.0657, "eval_samples_per_second": 23.383, "eval_steps_per_second": 0.368, "step": 260 }, { "epoch": 0.027, "grad_norm": 0.035989198833703995, "learning_rate": 4.962399180850275e-05, "loss": 0.12, "step": 270 }, { "epoch": 0.027, "eval_cos_sim": 0.8800029754638672, "eval_loss": 0.12096367742764426, "eval_runtime": 162.5704, "eval_samples_per_second": 24.605, "eval_steps_per_second": 0.388, "step": 270 }, { "epoch": 0.028, "grad_norm": 0.02917526848614216, "learning_rate": 2.0486158730277393e-05, "loss": 0.1205, "step": 280 }, { "epoch": 0.028, "eval_cos_sim": 0.8800209164619446, "eval_loss": 0.12094438698040914, "eval_runtime": 163.135, "eval_samples_per_second": 24.52, "eval_steps_per_second": 0.386, "step": 280 }, { "epoch": 0.029, "grad_norm": 0.040587518364191055, "learning_rate": 4.955273299787453e-05, "loss": 0.1204, "step": 290 }, { "epoch": 0.029, "eval_cos_sim": 0.8800665140151978, "eval_loss": 0.12090009071576072, "eval_runtime": 160.8422, "eval_samples_per_second": 24.869, "eval_steps_per_second": 0.392, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.02535935305058956, "learning_rate": 2.00985512699005e-05, "loss": 0.121, "step": 300 }, { "epoch": 0.03, "eval_cos_sim": 0.8799148201942444, "eval_loss": 0.12105432750927878, "eval_runtime": 162.6443, "eval_samples_per_second": 24.594, "eval_steps_per_second": 0.387, "step": 300 }, { "epoch": 0.031, "grad_norm": 0.027923179790377617, "learning_rate": 4.947535513144286e-05, "loss": 0.1197, "step": 310 }, { "epoch": 0.031, "eval_cos_sim": 0.8799843788146973, "eval_loss": 0.120985016367311, "eval_runtime": 165.6185, "eval_samples_per_second": 24.152, "eval_steps_per_second": 0.38, "step": 310 }, { "epoch": 0.032, "grad_norm": 0.025140805169939995, "learning_rate": 1.9712165353304617e-05, "loss": 0.1199, "step": 320 }, { "epoch": 0.032, "eval_cos_sim": 0.8800230622291565, "eval_loss": 0.1209463920806594, "eval_runtime": 161.2564, "eval_samples_per_second": 24.805, "eval_steps_per_second": 0.391, "step": 320 }, { "epoch": 0.033, "grad_norm": 0.03448393940925598, "learning_rate": 4.9391877493394335e-05, "loss": 0.1205, "step": 330 }, { "epoch": 0.033, "eval_cos_sim": 0.8800433278083801, "eval_loss": 0.12092636968838645, "eval_runtime": 165.583, "eval_samples_per_second": 24.157, "eval_steps_per_second": 0.38, "step": 330 }, { "epoch": 0.034, "grad_norm": 0.027445893734693527, "learning_rate": 1.9327097275960212e-05, "loss": 0.1208, "step": 340 }, { "epoch": 0.034, "eval_cos_sim": 0.8797867894172668, "eval_loss": 0.12118107464062644, "eval_runtime": 169.9936, "eval_samples_per_second": 23.53, "eval_steps_per_second": 0.371, "step": 340 }, { "epoch": 0.035, "grad_norm": 0.032430149614810944, "learning_rate": 4.9302320888106454e-05, "loss": 0.1192, "step": 350 }, { "epoch": 0.035, "eval_cos_sim": 0.8799253106117249, "eval_loss": 0.12104199745404196, "eval_runtime": 162.0388, "eval_samples_per_second": 24.685, "eval_steps_per_second": 0.389, "step": 350 }, { "epoch": 0.036, "grad_norm": 0.03066575713455677, "learning_rate": 1.894344300490539e-05, "loss": 0.1207, "step": 360 }, { "epoch": 0.036, "eval_cos_sim": 0.8800117373466492, "eval_loss": 0.12095709469067527, "eval_runtime": 168.2521, "eval_samples_per_second": 23.774, "eval_steps_per_second": 0.374, "step": 360 }, { "epoch": 0.037, "grad_norm": 0.04023744910955429, "learning_rate": 4.920670763496264e-05, "loss": 0.1206, "step": 370 }, { "epoch": 0.037, "eval_cos_sim": 0.8800341486930847, "eval_loss": 0.12093350794064474, "eval_runtime": 164.2446, "eval_samples_per_second": 24.354, "eval_steps_per_second": 0.384, "step": 370 }, { "epoch": 0.038, "grad_norm": 0.03345053270459175, "learning_rate": 1.8561298154827563e-05, "loss": 0.1207, "step": 380 }, { "epoch": 0.038, "eval_cos_sim": 0.8800336122512817, "eval_loss": 0.12093256904828024, "eval_runtime": 158.5429, "eval_samples_per_second": 25.23, "eval_steps_per_second": 0.397, "step": 380 }, { "epoch": 0.039, "grad_norm": 0.02383916825056076, "learning_rate": 4.910506156279026e-05, "loss": 0.1213, "step": 390 }, { "epoch": 0.039, "eval_cos_sim": 0.8800181150436401, "eval_loss": 0.12094816543805074, "eval_runtime": 164.6828, "eval_samples_per_second": 24.289, "eval_steps_per_second": 0.383, "step": 390 }, { "epoch": 0.04, "grad_norm": 0.03217790648341179, "learning_rate": 1.8180757964234907e-05, "loss": 0.1213, "step": 400 }, { "epoch": 0.04, "eval_cos_sim": 0.8800681829452515, "eval_loss": 0.12089718677746726, "eval_runtime": 162.9873, "eval_samples_per_second": 24.542, "eval_steps_per_second": 0.387, "step": 400 }, { "epoch": 0.041, "grad_norm": 0.03514571115374565, "learning_rate": 4.8997408003921466e-05, "loss": 0.1208, "step": 410 }, { "epoch": 0.041, "eval_cos_sim": 0.8801241517066956, "eval_loss": 0.12084018089520407, "eval_runtime": 167.298, "eval_samples_per_second": 23.909, "eval_steps_per_second": 0.377, "step": 410 }, { "epoch": 0.042, "grad_norm": 0.03063860908150673, "learning_rate": 1.780191727172083e-05, "loss": 0.1207, "step": 420 }, { "epoch": 0.042, "eval_cos_sim": 0.8799417018890381, "eval_loss": 0.12102477314221335, "eval_runtime": 165.8735, "eval_samples_per_second": 24.115, "eval_steps_per_second": 0.38, "step": 420 }, { "epoch": 0.043, "grad_norm": 0.0319770872592926, "learning_rate": 4.8883773787879826e-05, "loss": 0.1205, "step": 430 }, { "epoch": 0.043, "eval_cos_sim": 0.8800466060638428, "eval_loss": 0.12091960479962302, "eval_runtime": 163.3913, "eval_samples_per_second": 24.481, "eval_steps_per_second": 0.386, "step": 430 }, { "epoch": 0.044, "grad_norm": 0.02543482929468155, "learning_rate": 1.742487049232818e-05, "loss": 0.1202, "step": 440 }, { "epoch": 0.044, "eval_cos_sim": 0.8802583813667297, "eval_loss": 0.12070597412335349, "eval_runtime": 160.473, "eval_samples_per_second": 24.926, "eval_steps_per_second": 0.393, "step": 440 }, { "epoch": 0.045, "grad_norm": 0.024107394739985466, "learning_rate": 4.876418723469453e-05, "loss": 0.1196, "step": 450 }, { "epoch": 0.045, "eval_cos_sim": 0.8802623748779297, "eval_loss": 0.12070202877270651, "eval_runtime": 168.2567, "eval_samples_per_second": 23.773, "eval_steps_per_second": 0.374, "step": 450 }, { "epoch": 0.046, "grad_norm": 0.04505016654729843, "learning_rate": 1.7049711594019046e-05, "loss": 0.1197, "step": 460 }, { "epoch": 0.046, "eval_cos_sim": 0.8800992965698242, "eval_loss": 0.12086664869534446, "eval_runtime": 168.3415, "eval_samples_per_second": 23.761, "eval_steps_per_second": 0.374, "step": 460 }, { "epoch": 0.047, "grad_norm": 0.026298915967345238, "learning_rate": 4.8638678147841726e-05, "loss": 0.1207, "step": 470 }, { "epoch": 0.047, "eval_cos_sim": 0.8801872134208679, "eval_loss": 0.12077882673489523, "eval_runtime": 171.1708, "eval_samples_per_second": 23.368, "eval_steps_per_second": 0.368, "step": 470 }, { "epoch": 0.048, "grad_norm": 0.04072026535868645, "learning_rate": 1.667653407425599e-05, "loss": 0.12, "step": 480 }, { "epoch": 0.048, "eval_cos_sim": 0.8804126977920532, "eval_loss": 0.12055267622219992, "eval_runtime": 160.8906, "eval_samples_per_second": 24.862, "eval_steps_per_second": 0.392, "step": 480 }, { "epoch": 0.049, "grad_norm": 0.02353891357779503, "learning_rate": 4.850727780681685e-05, "loss": 0.121, "step": 490 }, { "epoch": 0.049, "eval_cos_sim": 0.8802867531776428, "eval_loss": 0.12067857982861471, "eval_runtime": 162.0814, "eval_samples_per_second": 24.679, "eval_steps_per_second": 0.389, "step": 490 }, { "epoch": 0.05, "grad_norm": 0.03163010999560356, "learning_rate": 1.6305430936700462e-05, "loss": 0.1206, "step": 500 }, { "epoch": 0.05, "eval_cos_sim": 0.8799078464508057, "eval_loss": 0.12105564882504416, "eval_runtime": 159.7041, "eval_samples_per_second": 25.046, "eval_steps_per_second": 0.394, "step": 500 }, { "epoch": 0.051, "grad_norm": 0.03480914607644081, "learning_rate": 4.8370018959339916e-05, "loss": 0.1193, "step": 510 }, { "epoch": 0.051, "eval_cos_sim": 0.8801398873329163, "eval_loss": 0.12082168438183737, "eval_runtime": 168.1565, "eval_samples_per_second": 23.787, "eval_steps_per_second": 0.375, "step": 510 }, { "epoch": 0.052, "grad_norm": 0.031566403806209564, "learning_rate": 1.5936494668034417e-05, "loss": 0.1207, "step": 520 }, { "epoch": 0.052, "eval_cos_sim": 0.8804723024368286, "eval_loss": 0.12048741771923971, "eval_runtime": 162.0779, "eval_samples_per_second": 24.679, "eval_steps_per_second": 0.389, "step": 520 }, { "epoch": 0.053, "grad_norm": 0.02134857140481472, "learning_rate": 4.822693581319333e-05, "loss": 0.1207, "step": 530 }, { "epoch": 0.053, "eval_cos_sim": 0.8804084062576294, "eval_loss": 0.12055278637158347, "eval_runtime": 165.0027, "eval_samples_per_second": 24.242, "eval_steps_per_second": 0.382, "step": 530 }, { "epoch": 0.054, "grad_norm": 0.02998766116797924, "learning_rate": 1.5569817214910634e-05, "loss": 0.1206, "step": 540 }, { "epoch": 0.054, "eval_cos_sim": 0.879996657371521, "eval_loss": 0.12096652509915305, "eval_runtime": 269.6523, "eval_samples_per_second": 14.834, "eval_steps_per_second": 0.234, "step": 540 }, { "epoch": 0.055, "grad_norm": 0.023394938558340073, "learning_rate": 4.807806402769648e-05, "loss": 0.1204, "step": 550 }, { "epoch": 0.055, "eval_cos_sim": 0.8802942037582397, "eval_loss": 0.12066655971753074, "eval_runtime": 241.2043, "eval_samples_per_second": 16.583, "eval_steps_per_second": 0.261, "step": 550 }, { "epoch": 0.056, "grad_norm": 0.04035342484712601, "learning_rate": 1.520548996103771e-05, "loss": 0.1208, "step": 560 }, { "epoch": 0.056, "eval_cos_sim": 0.8805367946624756, "eval_loss": 0.12042092802273703, "eval_runtime": 217.8859, "eval_samples_per_second": 18.358, "eval_steps_per_second": 0.289, "step": 560 }, { "epoch": 0.057, "grad_norm": 0.02704194188117981, "learning_rate": 4.7923440704819685e-05, "loss": 0.1205, "step": 570 }, { "epoch": 0.057, "eval_cos_sim": 0.8805016875267029, "eval_loss": 0.12045616819607688, "eval_runtime": 163.5639, "eval_samples_per_second": 24.455, "eval_steps_per_second": 0.385, "step": 570 }, { "epoch": 0.058, "grad_norm": 0.041525471955537796, "learning_rate": 1.4843603704405321e-05, "loss": 0.1209, "step": 580 }, { "epoch": 0.058, "eval_cos_sim": 0.8803950548171997, "eval_loss": 0.12056205751645041, "eval_runtime": 163.8454, "eval_samples_per_second": 24.413, "eval_steps_per_second": 0.385, "step": 580 }, { "epoch": 0.059, "grad_norm": 0.02588295191526413, "learning_rate": 4.7763104379936636e-05, "loss": 0.12, "step": 590 }, { "epoch": 0.059, "eval_cos_sim": 0.8804982304573059, "eval_loss": 0.12045794346081687, "eval_runtime": 175.2999, "eval_samples_per_second": 22.818, "eval_steps_per_second": 0.359, "step": 590 }, { "epoch": 0.06, "grad_norm": 0.030644405633211136, "learning_rate": 1.4484248634655188e-05, "loss": 0.1211, "step": 600 }, { "epoch": 0.06, "eval_cos_sim": 0.8804518580436707, "eval_loss": 0.12050184681164694, "eval_runtime": 165.1748, "eval_samples_per_second": 24.217, "eval_steps_per_second": 0.381, "step": 600 }, { "epoch": 0.061, "grad_norm": 0.03162102401256561, "learning_rate": 4.7597095012220556e-05, "loss": 0.1194, "step": 610 }, { "epoch": 0.061, "eval_cos_sim": 0.8805278539657593, "eval_loss": 0.12042546226727438, "eval_runtime": 169.0168, "eval_samples_per_second": 23.666, "eval_steps_per_second": 0.373, "step": 610 }, { "epoch": 0.062, "grad_norm": 0.030952898785471916, "learning_rate": 1.4127514310605238e-05, "loss": 0.1202, "step": 620 }, { "epoch": 0.062, "eval_cos_sim": 0.880526602268219, "eval_loss": 0.12043014385449362, "eval_runtime": 161.73, "eval_samples_per_second": 24.733, "eval_steps_per_second": 0.39, "step": 620 }, { "epoch": 0.063, "grad_norm": 0.025900695472955704, "learning_rate": 4.742545397468656e-05, "loss": 0.1205, "step": 630 }, { "epoch": 0.063, "eval_cos_sim": 0.8804138898849487, "eval_loss": 0.12054368068921043, "eval_runtime": 169.1165, "eval_samples_per_second": 23.652, "eval_steps_per_second": 0.373, "step": 630 }, { "epoch": 0.064, "grad_norm": 0.02121679112315178, "learning_rate": 1.3773489637927061e-05, "loss": 0.1208, "step": 640 }, { "epoch": 0.064, "eval_cos_sim": 0.8801329731941223, "eval_loss": 0.120824953577394, "eval_runtime": 161.444, "eval_samples_per_second": 24.776, "eval_steps_per_second": 0.39, "step": 640 }, { "epoch": 0.065, "grad_norm": 0.02153482660651207, "learning_rate": 4.7248224043879605e-05, "loss": 0.1211, "step": 650 }, { "epoch": 0.065, "eval_cos_sim": 0.8802359104156494, "eval_loss": 0.12072229530560447, "eval_runtime": 161.7761, "eval_samples_per_second": 24.726, "eval_steps_per_second": 0.389, "step": 650 }, { "epoch": 0.066, "grad_norm": 0.030305592343211174, "learning_rate": 1.342226284699112e-05, "loss": 0.1202, "step": 660 }, { "epoch": 0.066, "eval_cos_sim": 0.8804138898849487, "eval_loss": 0.12054470445858909, "eval_runtime": 166.8157, "eval_samples_per_second": 23.979, "eval_steps_per_second": 0.378, "step": 660 }, { "epoch": 0.067, "grad_norm": 0.03320358693599701, "learning_rate": 4.7065449389213644e-05, "loss": 0.1216, "step": 670 }, { "epoch": 0.067, "eval_cos_sim": 0.8803730607032776, "eval_loss": 0.12058561565625144, "eval_runtime": 168.9555, "eval_samples_per_second": 23.675, "eval_steps_per_second": 0.373, "step": 670 }, { "epoch": 0.068, "grad_norm": 0.03419356420636177, "learning_rate": 1.3073921470878081e-05, "loss": 0.1197, "step": 680 }, { "epoch": 0.068, "eval_cos_sim": 0.8803737163543701, "eval_loss": 0.12058660843121481, "eval_runtime": 162.073, "eval_samples_per_second": 24.68, "eval_steps_per_second": 0.389, "step": 680 }, { "epoch": 0.069, "grad_norm": 0.022392097860574722, "learning_rate": 4.6877175561964684e-05, "loss": 0.12, "step": 690 }, { "epoch": 0.069, "eval_cos_sim": 0.8804360628128052, "eval_loss": 0.12052560474621725, "eval_runtime": 184.9748, "eval_samples_per_second": 21.625, "eval_steps_per_second": 0.341, "step": 690 }, { "epoch": 0.07, "grad_norm": 0.03007390908896923, "learning_rate": 1.272855232356e-05, "loss": 0.1204, "step": 700 }, { "epoch": 0.07, "eval_cos_sim": 0.8805059790611267, "eval_loss": 0.12045389034497214, "eval_runtime": 165.5078, "eval_samples_per_second": 24.168, "eval_steps_per_second": 0.381, "step": 700 }, { "epoch": 0.071, "grad_norm": 0.020742209628224373, "learning_rate": 4.6683449483917846e-05, "loss": 0.12, "step": 710 }, { "epoch": 0.071, "eval_cos_sim": 0.8806586861610413, "eval_loss": 0.12030088522183371, "eval_runtime": 172.1855, "eval_samples_per_second": 23.231, "eval_steps_per_second": 0.366, "step": 710 }, { "epoch": 0.072, "grad_norm": 0.023063719272613525, "learning_rate": 1.2386241478270652e-05, "loss": 0.1198, "step": 720 }, { "epoch": 0.072, "eval_cos_sim": 0.8805540204048157, "eval_loss": 0.12040612079846334, "eval_runtime": 166.7801, "eval_samples_per_second": 23.984, "eval_steps_per_second": 0.378, "step": 720 }, { "epoch": 0.073, "grad_norm": 0.027647124603390694, "learning_rate": 4.648431943567264e-05, "loss": 0.1205, "step": 730 }, { "epoch": 0.073, "eval_cos_sim": 0.8805664777755737, "eval_loss": 0.12039038088070822, "eval_runtime": 164.5025, "eval_samples_per_second": 24.316, "eval_steps_per_second": 0.383, "step": 730 }, { "epoch": 0.074, "grad_norm": 0.02208826318383217, "learning_rate": 1.204707424604792e-05, "loss": 0.1203, "step": 740 }, { "epoch": 0.074, "eval_cos_sim": 0.8805152177810669, "eval_loss": 0.12043928005444479, "eval_runtime": 178.0321, "eval_samples_per_second": 22.468, "eval_steps_per_second": 0.354, "step": 740 }, { "epoch": 0.075, "grad_norm": 0.021549325436353683, "learning_rate": 4.627983504461235e-05, "loss": 0.1196, "step": 750 }, { "epoch": 0.075, "eval_cos_sim": 0.8806087374687195, "eval_loss": 0.12034820940243673, "eval_runtime": 179.3479, "eval_samples_per_second": 22.303, "eval_steps_per_second": 0.351, "step": 750 }, { "epoch": 0.076, "grad_norm": 0.028022369369864464, "learning_rate": 1.1711135154477562e-05, "loss": 0.1207, "step": 760 }, { "epoch": 0.076, "eval_cos_sim": 0.8805749416351318, "eval_loss": 0.12038306286084127, "eval_runtime": 176.8723, "eval_samples_per_second": 22.615, "eval_steps_per_second": 0.356, "step": 760 }, { "epoch": 0.077, "grad_norm": 0.021017303690314293, "learning_rate": 4.607004727253391e-05, "loss": 0.12, "step": 770 }, { "epoch": 0.077, "eval_cos_sim": 0.8806374669075012, "eval_loss": 0.12032015850293112, "eval_runtime": 163.1866, "eval_samples_per_second": 24.512, "eval_steps_per_second": 0.386, "step": 770 }, { "epoch": 0.078, "grad_norm": 0.02246786840260029, "learning_rate": 1.1378507926623572e-05, "loss": 0.1199, "step": 780 }, { "epoch": 0.078, "eval_cos_sim": 0.8806332349777222, "eval_loss": 0.12032350780713034, "eval_runtime": 163.0562, "eval_samples_per_second": 24.531, "eval_steps_per_second": 0.386, "step": 780 }, { "epoch": 0.079, "grad_norm": 0.021708086133003235, "learning_rate": 4.585500840294793e-05, "loss": 0.1201, "step": 790 }, { "epoch": 0.079, "eval_cos_sim": 0.8807349801063538, "eval_loss": 0.12022322513806295, "eval_runtime": 173.5217, "eval_samples_per_second": 23.052, "eval_steps_per_second": 0.363, "step": 790 }, { "epoch": 0.08, "grad_norm": 0.034823887050151825, "learning_rate": 1.1049275460164102e-05, "loss": 0.1204, "step": 800 }, { "epoch": 0.08, "eval_cos_sim": 0.8806157112121582, "eval_loss": 0.12034289600598289, "eval_runtime": 163.4003, "eval_samples_per_second": 24.48, "eval_steps_per_second": 0.386, "step": 800 }, { "epoch": 0.081, "grad_norm": 0.02099907584488392, "learning_rate": 4.563477202804924e-05, "loss": 0.1203, "step": 810 }, { "epoch": 0.081, "eval_cos_sim": 0.8805558681488037, "eval_loss": 0.1204009156440444, "eval_runtime": 171.4073, "eval_samples_per_second": 23.336, "eval_steps_per_second": 0.368, "step": 810 }, { "epoch": 0.082, "grad_norm": 0.027718910947442055, "learning_rate": 1.0723519806732512e-05, "loss": 0.1206, "step": 820 }, { "epoch": 0.082, "eval_cos_sim": 0.8804323673248291, "eval_loss": 0.12052262928235007, "eval_runtime": 166.5171, "eval_samples_per_second": 24.022, "eval_steps_per_second": 0.378, "step": 820 }, { "epoch": 0.083, "grad_norm": 0.030117569491267204, "learning_rate": 4.540939303535997e-05, "loss": 0.1208, "step": 830 }, { "epoch": 0.083, "eval_cos_sim": 0.8806024193763733, "eval_loss": 0.12035309459912252, "eval_runtime": 172.9809, "eval_samples_per_second": 23.124, "eval_steps_per_second": 0.364, "step": 830 }, { "epoch": 0.084, "grad_norm": 0.025621019303798676, "learning_rate": 1.0401322151467458e-05, "loss": 0.1207, "step": 840 }, { "epoch": 0.084, "eval_cos_sim": 0.8805838823318481, "eval_loss": 0.12037316419827414, "eval_runtime": 161.0298, "eval_samples_per_second": 24.84, "eval_steps_per_second": 0.391, "step": 840 }, { "epoch": 0.085, "grad_norm": 0.037043701857328415, "learning_rate": 4.517892759404947e-05, "loss": 0.1192, "step": 850 }, { "epoch": 0.085, "eval_cos_sim": 0.8807790279388428, "eval_loss": 0.12017762710797263, "eval_runtime": 165.404, "eval_samples_per_second": 24.183, "eval_steps_per_second": 0.381, "step": 850 }, { "epoch": 0.086, "grad_norm": 0.024647973477840424, "learning_rate": 1.0082762792778497e-05, "loss": 0.12, "step": 860 }, { "epoch": 0.086, "eval_cos_sim": 0.8808472156524658, "eval_loss": 0.12010916282879783, "eval_runtime": 159.4806, "eval_samples_per_second": 25.081, "eval_steps_per_second": 0.395, "step": 860 }, { "epoch": 0.087, "grad_norm": 0.02787039987742901, "learning_rate": 4.494343314093799e-05, "loss": 0.1192, "step": 870 }, { "epoch": 0.087, "eval_cos_sim": 0.8806514143943787, "eval_loss": 0.12030471136319112, "eval_runtime": 167.7185, "eval_samples_per_second": 23.849, "eval_steps_per_second": 0.376, "step": 870 }, { "epoch": 0.088, "grad_norm": 0.027198661118745804, "learning_rate": 9.767921122337203e-06, "loss": 0.12, "step": 880 }, { "epoch": 0.088, "eval_cos_sim": 0.8806140422821045, "eval_loss": 0.12034183456646871, "eval_runtime": 164.5125, "eval_samples_per_second": 24.314, "eval_steps_per_second": 0.383, "step": 880 }, { "epoch": 0.089, "grad_norm": 0.020295780152082443, "learning_rate": 4.4702968366179995e-05, "loss": 0.121, "step": 890 }, { "epoch": 0.089, "eval_cos_sim": 0.8807177543640137, "eval_loss": 0.12023961307751609, "eval_runtime": 178.9283, "eval_samples_per_second": 22.355, "eval_steps_per_second": 0.352, "step": 890 }, { "epoch": 0.09, "grad_norm": 0.025682412087917328, "learning_rate": 9.456875605287963e-06, "loss": 0.1197, "step": 900 }, { "epoch": 0.09, "eval_cos_sim": 0.8808146715164185, "eval_loss": 0.12014213421093893, "eval_runtime": 162.1503, "eval_samples_per_second": 24.668, "eval_steps_per_second": 0.389, "step": 900 }, { "epoch": 0.091, "grad_norm": 0.0241321362555027, "learning_rate": 4.4457593198638266e-05, "loss": 0.1204, "step": 910 }, { "epoch": 0.091, "eval_cos_sim": 0.8806710243225098, "eval_loss": 0.12028472664105368, "eval_runtime": 170.9864, "eval_samples_per_second": 23.394, "eval_steps_per_second": 0.368, "step": 910 }, { "epoch": 0.092, "grad_norm": 0.03511843457818031, "learning_rate": 9.149703760693733e-06, "loss": 0.1204, "step": 920 }, { "epoch": 0.092, "eval_cos_sim": 0.8806130886077881, "eval_loss": 0.12034211971508932, "eval_runtime": 163.6776, "eval_samples_per_second": 24.438, "eval_steps_per_second": 0.385, "step": 920 }, { "epoch": 0.093, "grad_norm": 0.03159726411104202, "learning_rate": 4.420736879094911e-05, "loss": 0.1208, "step": 930 }, { "epoch": 0.093, "eval_cos_sim": 0.8807392716407776, "eval_loss": 0.12021884395825339, "eval_runtime": 172.0497, "eval_samples_per_second": 23.249, "eval_steps_per_second": 0.366, "step": 930 }, { "epoch": 0.094, "grad_norm": 0.02288082055747509, "learning_rate": 8.846482142219678e-06, "loss": 0.1206, "step": 940 }, { "epoch": 0.094, "eval_cos_sim": 0.8808532953262329, "eval_loss": 0.12010584022747946, "eval_runtime": 160.6917, "eval_samples_per_second": 24.892, "eval_steps_per_second": 0.392, "step": 940 }, { "epoch": 0.095, "grad_norm": 0.022692304104566574, "learning_rate": 4.395235750428116e-05, "loss": 0.1193, "step": 950 }, { "epoch": 0.095, "eval_cos_sim": 0.8807929158210754, "eval_loss": 0.1201639943336196, "eval_runtime": 165.9011, "eval_samples_per_second": 24.111, "eval_steps_per_second": 0.38, "step": 950 }, { "epoch": 0.096, "grad_norm": 0.02069064788520336, "learning_rate": 8.547286319049193e-06, "loss": 0.1204, "step": 960 }, { "epoch": 0.096, "eval_cos_sim": 0.8806983232498169, "eval_loss": 0.12025791980969382, "eval_runtime": 163.2536, "eval_samples_per_second": 24.502, "eval_steps_per_second": 0.386, "step": 960 }, { "epoch": 0.097, "grad_norm": 0.024405937641859055, "learning_rate": 4.369262289279257e-05, "loss": 0.12, "step": 970 }, { "epoch": 0.097, "eval_cos_sim": 0.8808521628379822, "eval_loss": 0.1201059584830947, "eval_runtime": 161.1408, "eval_samples_per_second": 24.823, "eval_steps_per_second": 0.391, "step": 970 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 440, "trial_name": null, "trial_params": null }