{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 200, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021373230029388193, "grad_norm": 0.0, "learning_rate": 0.0, "logits": -2.7276527881622314, "logps": -123.19757843017578, "loss": 0.6931, "step": 1 }, { "epoch": 0.010686615014694095, "grad_norm": 27.89530248586404, "learning_rate": 1.0638297872340425e-08, "logits": -2.8715224266052246, "logps": -234.59034729003906, "loss": 0.6931, "step": 5 }, { "epoch": 0.02137323002938819, "grad_norm": 24.041320691027245, "learning_rate": 6.382978723404254e-08, "logits": -2.8461320400238037, "logps": -248.1672821044922, "loss": 0.6928, "step": 10 }, { "epoch": 0.03205984504408229, "grad_norm": 24.00395488621066, "learning_rate": 1.0638297872340425e-07, "logits": -2.780062437057495, "logps": -229.3790740966797, "loss": 0.683, "step": 15 }, { "epoch": 0.04274646005877638, "grad_norm": 19.287203304415307, "learning_rate": 1.4893617021276595e-07, "logits": -2.772031545639038, "logps": -204.7851104736328, "loss": 0.6589, "step": 20 }, { "epoch": 0.053433075073470476, "grad_norm": 16.506203912380084, "learning_rate": 2.0212765957446807e-07, "logits": -2.9439215660095215, "logps": -291.8533630371094, "loss": 0.5956, "step": 25 }, { "epoch": 0.06411969008816458, "grad_norm": 13.779319784342116, "learning_rate": 2.4468085106382976e-07, "logits": -2.911271572113037, "logps": -281.06744384765625, "loss": 0.5498, "step": 30 }, { "epoch": 0.07480630510285867, "grad_norm": 12.4154908271174, "learning_rate": 2.978723404255319e-07, "logits": -2.8849587440490723, "logps": -235.69473266601562, "loss": 0.5207, "step": 35 }, { "epoch": 0.08549292011755276, "grad_norm": 13.748382495294905, "learning_rate": 3.5106382978723405e-07, "logits": -2.877370595932007, "logps": -235.3643035888672, "loss": 0.4912, "step": 40 }, { "epoch": 0.09617953513224686, "grad_norm": 13.504025107902667, "learning_rate": 4.0425531914893614e-07, "logits": -2.7878780364990234, "logps": -260.6932678222656, "loss": 0.4916, "step": 45 }, { "epoch": 0.10686615014694095, "grad_norm": 14.324493553144778, "learning_rate": 4.574468085106383e-07, "logits": -2.603867292404175, "logps": -273.16552734375, "loss": 0.4721, "step": 50 }, { "epoch": 0.11755276516163506, "grad_norm": 15.40742501299336, "learning_rate": 4.999930062653174e-07, "logits": -2.7843973636627197, "logps": -288.14190673828125, "loss": 0.4515, "step": 55 }, { "epoch": 0.12823938017632916, "grad_norm": 14.561448001055401, "learning_rate": 4.997482666353286e-07, "logits": -2.647210121154785, "logps": -257.77447509765625, "loss": 0.4475, "step": 60 }, { "epoch": 0.13892599519102325, "grad_norm": 15.068250536743896, "learning_rate": 4.991542314714122e-07, "logits": -2.5821423530578613, "logps": -303.4721984863281, "loss": 0.4282, "step": 65 }, { "epoch": 0.14961261020571734, "grad_norm": 16.720167595176566, "learning_rate": 4.982117315854593e-07, "logits": -2.341573476791382, "logps": -270.63189697265625, "loss": 0.4396, "step": 70 }, { "epoch": 0.16029922522041143, "grad_norm": 13.024576148545005, "learning_rate": 4.969220851487844e-07, "logits": -2.181319236755371, "logps": -271.0784912109375, "loss": 0.4102, "step": 75 }, { "epoch": 0.17098584023510552, "grad_norm": 14.533980847784468, "learning_rate": 4.952870958485431e-07, "logits": -2.5626049041748047, "logps": -315.2617492675781, "loss": 0.4169, "step": 80 }, { "epoch": 0.18167245524979964, "grad_norm": 20.372388284984453, "learning_rate": 4.933090503651128e-07, "logits": -2.1712753772735596, "logps": -308.96380615234375, "loss": 0.4085, "step": 85 }, { "epoch": 0.19235907026449373, "grad_norm": 16.99275205662673, "learning_rate": 4.909907151739633e-07, "logits": -2.3788561820983887, "logps": -242.3260498046875, "loss": 0.4102, "step": 90 }, { "epoch": 0.20304568527918782, "grad_norm": 15.680297362409974, "learning_rate": 4.883353326764906e-07, "logits": -2.041024684906006, "logps": -308.76361083984375, "loss": 0.4059, "step": 95 }, { "epoch": 0.2137323002938819, "grad_norm": 16.614616612784232, "learning_rate": 4.853466166652258e-07, "logits": -2.2316627502441406, "logps": -250.2170867919922, "loss": 0.3966, "step": 100 }, { "epoch": 0.224418915308576, "grad_norm": 15.24947145329996, "learning_rate": 4.820287471297597e-07, "logits": -2.2991251945495605, "logps": -299.30413818359375, "loss": 0.3826, "step": 105 }, { "epoch": 0.2351055303232701, "grad_norm": 13.727683778137456, "learning_rate": 4.783863644106502e-07, "logits": -2.3693361282348633, "logps": -316.0078125, "loss": 0.3928, "step": 110 }, { "epoch": 0.2457921453379642, "grad_norm": 14.648764803231707, "learning_rate": 4.744245627094858e-07, "logits": -2.2451415061950684, "logps": -289.3954772949219, "loss": 0.4144, "step": 115 }, { "epoch": 0.2564787603526583, "grad_norm": 14.654511458237025, "learning_rate": 4.7014888296418447e-07, "logits": -2.1494853496551514, "logps": -273.82159423828125, "loss": 0.4048, "step": 120 }, { "epoch": 0.2671653753673524, "grad_norm": 14.029747414365776, "learning_rate": 4.655653050994906e-07, "logits": -2.2021608352661133, "logps": -295.5478515625, "loss": 0.3795, "step": 125 }, { "epoch": 0.2778519903820465, "grad_norm": 14.629974380430687, "learning_rate": 4.606802396635098e-07, "logits": -2.278817653656006, "logps": -288.4320983886719, "loss": 0.4076, "step": 130 }, { "epoch": 0.2885386053967406, "grad_norm": 15.667008536793405, "learning_rate": 4.555005188619775e-07, "logits": -2.370594024658203, "logps": -278.10565185546875, "loss": 0.3922, "step": 135 }, { "epoch": 0.2992252204114347, "grad_norm": 16.537626095297334, "learning_rate": 4.500333870028016e-07, "logits": -2.296696424484253, "logps": -314.9455871582031, "loss": 0.3805, "step": 140 }, { "epoch": 0.30991183542612877, "grad_norm": 14.136931000002011, "learning_rate": 4.442864903642427e-07, "logits": -1.9875481128692627, "logps": -321.88336181640625, "loss": 0.3807, "step": 145 }, { "epoch": 0.32059845044082286, "grad_norm": 17.66764816927005, "learning_rate": 4.3826786650090273e-07, "logits": -2.300191879272461, "logps": -288.90155029296875, "loss": 0.3962, "step": 150 }, { "epoch": 0.33128506545551695, "grad_norm": 13.966542226658245, "learning_rate": 4.319859330024777e-07, "logits": -2.366628408432007, "logps": -320.36199951171875, "loss": 0.3882, "step": 155 }, { "epoch": 0.34197168047021104, "grad_norm": 15.005833930316435, "learning_rate": 4.254494757209979e-07, "logits": -2.2027194499969482, "logps": -344.4361267089844, "loss": 0.3849, "step": 160 }, { "epoch": 0.3526582954849052, "grad_norm": 18.876637586071404, "learning_rate": 4.186676364830186e-07, "logits": -2.357083559036255, "logps": -315.52972412109375, "loss": 0.3865, "step": 165 }, { "epoch": 0.36334491049959927, "grad_norm": 15.883791751426108, "learning_rate": 4.1164990030394985e-07, "logits": -2.214961051940918, "logps": -286.7484130859375, "loss": 0.383, "step": 170 }, { "epoch": 0.37403152551429336, "grad_norm": 15.570319458411793, "learning_rate": 4.0440608212240445e-07, "logits": -2.1734325885772705, "logps": -290.1646728515625, "loss": 0.3771, "step": 175 }, { "epoch": 0.38471814052898745, "grad_norm": 16.13327248480777, "learning_rate": 3.9694631307311825e-07, "logits": -2.2254865169525146, "logps": -303.46368408203125, "loss": 0.3899, "step": 180 }, { "epoch": 0.39540475554368154, "grad_norm": 13.968633194350911, "learning_rate": 3.8928102631764304e-07, "logits": -2.254255771636963, "logps": -246.87158203125, "loss": 0.3705, "step": 185 }, { "epoch": 0.40609137055837563, "grad_norm": 16.518660551756426, "learning_rate": 3.8142094245262615e-07, "logits": -2.283003568649292, "logps": -298.9637451171875, "loss": 0.3755, "step": 190 }, { "epoch": 0.4167779855730697, "grad_norm": 17.34857995588876, "learning_rate": 3.7337705451608667e-07, "logits": -2.154602289199829, "logps": -354.7010192871094, "loss": 0.3748, "step": 195 }, { "epoch": 0.4274646005877638, "grad_norm": 15.365894431623499, "learning_rate": 3.6516061261265805e-07, "logits": -2.255361795425415, "logps": -273.0242004394531, "loss": 0.3733, "step": 200 }, { "epoch": 0.4274646005877638, "eval_logits": -2.2792365550994873, "eval_logps": -309.9268493652344, "eval_loss": 0.3719645142555237, "eval_runtime": 491.0681, "eval_samples_per_second": 4.008, "eval_steps_per_second": 0.25, "step": 200 }, { "epoch": 0.4381512156024579, "grad_norm": 15.070320151495675, "learning_rate": 3.567831081792992e-07, "logits": -2.1955361366271973, "logps": -286.1488037109375, "loss": 0.3799, "step": 205 }, { "epoch": 0.448837830617152, "grad_norm": 14.900452812607735, "learning_rate": 3.482562579134809e-07, "logits": -2.0645949840545654, "logps": -300.73236083984375, "loss": 0.3727, "step": 210 }, { "epoch": 0.45952444563184613, "grad_norm": 16.05224888818079, "learning_rate": 3.39591987386325e-07, "logits": -1.9900414943695068, "logps": -282.0771484375, "loss": 0.3582, "step": 215 }, { "epoch": 0.4702110606465402, "grad_norm": 15.03367961373721, "learning_rate": 3.30802414363615e-07, "logits": -1.9547094106674194, "logps": -248.65402221679688, "loss": 0.3634, "step": 220 }, { "epoch": 0.4808976756612343, "grad_norm": 18.773649100686807, "learning_rate": 3.218998318580043e-07, "logits": -2.17350172996521, "logps": -326.99517822265625, "loss": 0.367, "step": 225 }, { "epoch": 0.4915842906759284, "grad_norm": 14.562679231504998, "learning_rate": 3.128966909361271e-07, "logits": -2.153386116027832, "logps": -335.359375, "loss": 0.3683, "step": 230 }, { "epoch": 0.5022709056906225, "grad_norm": 13.853727892905015, "learning_rate": 3.038055833046555e-07, "logits": -2.035804510116577, "logps": -272.8970642089844, "loss": 0.3716, "step": 235 }, { "epoch": 0.5129575207053166, "grad_norm": 15.090711014568011, "learning_rate": 2.9463922369965915e-07, "logits": -1.9920990467071533, "logps": -318.5932922363281, "loss": 0.3686, "step": 240 }, { "epoch": 0.5236441357200107, "grad_norm": 16.18137909043194, "learning_rate": 2.8541043210389726e-07, "logits": -2.217284679412842, "logps": -294.2337341308594, "loss": 0.3545, "step": 245 }, { "epoch": 0.5343307507347048, "grad_norm": 14.063753919982574, "learning_rate": 2.761321158169134e-07, "logits": -2.3281540870666504, "logps": -285.443359375, "loss": 0.3574, "step": 250 }, { "epoch": 0.5450173657493989, "grad_norm": 17.098897367043495, "learning_rate": 2.6681725140300995e-07, "logits": -1.7651288509368896, "logps": -297.5621032714844, "loss": 0.3564, "step": 255 }, { "epoch": 0.555703980764093, "grad_norm": 16.412107745592355, "learning_rate": 2.574788665423496e-07, "logits": -1.856030821800232, "logps": -297.8916320800781, "loss": 0.3588, "step": 260 }, { "epoch": 0.566390595778787, "grad_norm": 14.888086982411561, "learning_rate": 2.4813002181056676e-07, "logits": -2.086013078689575, "logps": -289.2059020996094, "loss": 0.3562, "step": 265 }, { "epoch": 0.5770772107934812, "grad_norm": 20.34056135034251, "learning_rate": 2.3878379241237134e-07, "logits": -1.7992274761199951, "logps": -286.0703125, "loss": 0.3674, "step": 270 }, { "epoch": 0.5877638258081752, "grad_norm": 16.00937961787345, "learning_rate": 2.2945324989469243e-07, "logits": -2.1212961673736572, "logps": -294.78125, "loss": 0.3583, "step": 275 }, { "epoch": 0.5984504408228694, "grad_norm": 15.0543607024416, "learning_rate": 2.2015144386493895e-07, "logits": -1.5599911212921143, "logps": -331.1915588378906, "loss": 0.3612, "step": 280 }, { "epoch": 0.6091370558375635, "grad_norm": 15.738762756418016, "learning_rate": 2.1089138373994222e-07, "logits": -1.6524254083633423, "logps": -275.34027099609375, "loss": 0.3517, "step": 285 }, { "epoch": 0.6198236708522575, "grad_norm": 14.233606209222401, "learning_rate": 2.0168602055111173e-07, "logits": -1.846451997756958, "logps": -323.7337341308594, "loss": 0.3594, "step": 290 }, { "epoch": 0.6305102858669517, "grad_norm": 14.831569367257195, "learning_rate": 1.9254822883124517e-07, "logits": -1.5174415111541748, "logps": -268.7288818359375, "loss": 0.3556, "step": 295 }, { "epoch": 0.6411969008816457, "grad_norm": 14.671373291294442, "learning_rate": 1.8349078860833124e-07, "logits": -1.7903592586517334, "logps": -292.779052734375, "loss": 0.3559, "step": 300 }, { "epoch": 0.6518835158963399, "grad_norm": 15.705427082152443, "learning_rate": 1.745263675315245e-07, "logits": -1.7898918390274048, "logps": -310.0693664550781, "loss": 0.3571, "step": 305 }, { "epoch": 0.6625701309110339, "grad_norm": 14.233332865288965, "learning_rate": 1.656675031542925e-07, "logits": -1.736101508140564, "logps": -288.0835266113281, "loss": 0.3618, "step": 310 }, { "epoch": 0.673256745925728, "grad_norm": 13.101127579355996, "learning_rate": 1.569265853995137e-07, "logits": -2.0390021800994873, "logps": -329.4677429199219, "loss": 0.3578, "step": 315 }, { "epoch": 0.6839433609404221, "grad_norm": 15.898783980322763, "learning_rate": 1.4831583923104998e-07, "logits": -1.9800916910171509, "logps": -278.5652770996094, "loss": 0.3391, "step": 320 }, { "epoch": 0.6946299759551162, "grad_norm": 15.179976349180745, "learning_rate": 1.3984730755602903e-07, "logits": -2.15975284576416, "logps": -331.96722412109375, "loss": 0.3488, "step": 325 }, { "epoch": 0.7053165909698104, "grad_norm": 16.173648063524812, "learning_rate": 1.3153283438175034e-07, "logits": -2.1058340072631836, "logps": -319.34527587890625, "loss": 0.3568, "step": 330 }, { "epoch": 0.7160032059845044, "grad_norm": 14.495401917181017, "learning_rate": 1.2338404825076935e-07, "logits": -1.7633529901504517, "logps": -351.7260437011719, "loss": 0.3397, "step": 335 }, { "epoch": 0.7266898209991985, "grad_norm": 14.748613870290693, "learning_rate": 1.1541234597732947e-07, "logits": -1.9439738988876343, "logps": -284.2515563964844, "loss": 0.3488, "step": 340 }, { "epoch": 0.7373764360138926, "grad_norm": 15.431597691399574, "learning_rate": 1.0762887670788701e-07, "logits": -2.0670387744903564, "logps": -324.03240966796875, "loss": 0.3568, "step": 345 }, { "epoch": 0.7480630510285867, "grad_norm": 13.916918536725055, "learning_rate": 1.0004452632802158e-07, "logits": -1.9829730987548828, "logps": -283.0121154785156, "loss": 0.3371, "step": 350 }, { "epoch": 0.7587496660432808, "grad_norm": 14.83244858797638, "learning_rate": 9.266990223754067e-08, "logits": -2.1220943927764893, "logps": -284.113525390625, "loss": 0.3572, "step": 355 }, { "epoch": 0.7694362810579749, "grad_norm": 13.906877690225956, "learning_rate": 8.551531851507185e-08, "logits": -1.8662292957305908, "logps": -316.2903747558594, "loss": 0.3534, "step": 360 }, { "epoch": 0.7801228960726689, "grad_norm": 16.021070732056423, "learning_rate": 7.859078149289144e-08, "logits": -2.0029776096343994, "logps": -290.8583068847656, "loss": 0.3611, "step": 365 }, { "epoch": 0.7908095110873631, "grad_norm": 13.954109703177394, "learning_rate": 7.190597576216384e-08, "logits": -1.896113634109497, "logps": -294.7978210449219, "loss": 0.349, "step": 370 }, { "epoch": 0.8014961261020572, "grad_norm": 18.103369920066683, "learning_rate": 6.547025062816486e-08, "logits": -1.7909294366836548, "logps": -298.8819885253906, "loss": 0.3567, "step": 375 }, { "epoch": 0.8121827411167513, "grad_norm": 12.482171705770575, "learning_rate": 5.929260703443337e-08, "logits": -1.713022232055664, "logps": -309.056396484375, "loss": 0.3468, "step": 380 }, { "epoch": 0.8228693561314454, "grad_norm": 15.966406254185237, "learning_rate": 5.338168497413756e-08, "logits": -1.485386610031128, "logps": -301.24560546875, "loss": 0.3568, "step": 385 }, { "epoch": 0.8335559711461394, "grad_norm": 14.935885056404505, "learning_rate": 4.774575140626316e-08, "logits": -1.68508780002594, "logps": -302.7105407714844, "loss": 0.3465, "step": 390 }, { "epoch": 0.8442425861608336, "grad_norm": 14.494342289383189, "learning_rate": 4.2392688693524055e-08, "logits": -1.780106782913208, "logps": -281.1724548339844, "loss": 0.3577, "step": 395 }, { "epoch": 0.8549292011755276, "grad_norm": 14.966163192699174, "learning_rate": 3.732998357816514e-08, "logits": -1.9449115991592407, "logps": -308.26251220703125, "loss": 0.355, "step": 400 }, { "epoch": 0.8549292011755276, "eval_logits": -1.9703269004821777, "eval_logps": -324.5088806152344, "eval_loss": 0.3534272313117981, "eval_runtime": 480.7572, "eval_samples_per_second": 4.094, "eval_steps_per_second": 0.256, "step": 400 }, { "epoch": 0.8656158161902218, "grad_norm": 14.52513199126684, "learning_rate": 3.256471671107616e-08, "logits": -1.9270665645599365, "logps": -320.7890319824219, "loss": 0.357, "step": 405 }, { "epoch": 0.8763024312049158, "grad_norm": 13.984563355291044, "learning_rate": 2.8103552748861475e-08, "logits": -1.7152255773544312, "logps": -300.7731628417969, "loss": 0.3598, "step": 410 }, { "epoch": 0.88698904621961, "grad_norm": 14.474719718248345, "learning_rate": 2.3952731032714973e-08, "logits": -1.8561521768569946, "logps": -279.46380615234375, "loss": 0.3509, "step": 415 }, { "epoch": 0.897675661234304, "grad_norm": 15.20686225812657, "learning_rate": 2.0118056862137354e-08, "logits": -2.1437106132507324, "logps": -296.0022277832031, "loss": 0.3596, "step": 420 }, { "epoch": 0.9083622762489981, "grad_norm": 13.705499471422334, "learning_rate": 1.6604893375699592e-08, "logits": -1.8986858129501343, "logps": -294.5618896484375, "loss": 0.3566, "step": 425 }, { "epoch": 0.9190488912636923, "grad_norm": 14.224665908488847, "learning_rate": 1.3418154050208936e-08, "logits": -1.9494727849960327, "logps": -325.150634765625, "loss": 0.3432, "step": 430 }, { "epoch": 0.9297355062783863, "grad_norm": 13.456954319141683, "learning_rate": 1.0562295828767387e-08, "logits": -2.093982458114624, "logps": -293.69842529296875, "loss": 0.3565, "step": 435 }, { "epoch": 0.9404221212930804, "grad_norm": 13.391645385265695, "learning_rate": 8.041312887333396e-09, "logits": -1.904130220413208, "logps": -276.39105224609375, "loss": 0.3404, "step": 440 }, { "epoch": 0.9511087363077745, "grad_norm": 20.71139273996219, "learning_rate": 5.858731048505927e-09, "logits": -1.914345383644104, "logps": -338.41656494140625, "loss": 0.3573, "step": 445 }, { "epoch": 0.9617953513224686, "grad_norm": 19.97291791251005, "learning_rate": 4.0176028503425826e-09, "logits": -1.9096574783325195, "logps": -301.0379943847656, "loss": 0.355, "step": 450 }, { "epoch": 0.9724819663371627, "grad_norm": 14.314824993795222, "learning_rate": 2.5205032771092592e-09, "logits": -1.702121376991272, "logps": -298.85516357421875, "loss": 0.3377, "step": 455 }, { "epoch": 0.9831685813518568, "grad_norm": 13.935952369098977, "learning_rate": 1.3695261579316775e-09, "logits": -1.7297840118408203, "logps": -259.74139404296875, "loss": 0.3506, "step": 460 }, { "epoch": 0.9938551963665508, "grad_norm": 16.215817529965406, "learning_rate": 5.662812383859794e-10, "logits": -1.9168570041656494, "logps": -299.5458068847656, "loss": 0.3554, "step": 465 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 0.3941320441264412, "train_runtime": 39337.3566, "train_samples_per_second": 1.522, "train_steps_per_second": 0.012 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }