{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 5000,
"global_step": 87900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11376564277588168,
"grad_norm": 4.130064964294434,
"learning_rate": 7.960000000000001e-05,
"loss": 5.5183,
"step": 1000
},
{
"epoch": 0.22753128555176336,
"grad_norm": 3.2718617916107178,
"learning_rate": 0.0001596,
"loss": 2.7046,
"step": 2000
},
{
"epoch": 0.3412969283276451,
"grad_norm": 3.3528096675872803,
"learning_rate": 0.0002396,
"loss": 1.8501,
"step": 3000
},
{
"epoch": 0.4550625711035267,
"grad_norm": 3.1067423820495605,
"learning_rate": 0.0003196,
"loss": 1.6051,
"step": 4000
},
{
"epoch": 0.5688282138794084,
"grad_norm": 2.107377052307129,
"learning_rate": 0.0003996,
"loss": 1.4934,
"step": 5000
},
{
"epoch": 0.5688282138794084,
"eval_accuracy": 0.644404,
"eval_loss": 1.4417771100997925,
"eval_runtime": 11.7201,
"eval_samples_per_second": 21330.792,
"eval_steps_per_second": 41.723,
"step": 5000
},
{
"epoch": 0.6825938566552902,
"grad_norm": 2.3725779056549072,
"learning_rate": 0.00047960000000000006,
"loss": 1.4263,
"step": 6000
},
{
"epoch": 0.7963594994311718,
"grad_norm": 1.8981488943099976,
"learning_rate": 0.0005596,
"loss": 1.3761,
"step": 7000
},
{
"epoch": 0.9101251422070534,
"grad_norm": 1.904260516166687,
"learning_rate": 0.0006396,
"loss": 1.339,
"step": 8000
},
{
"epoch": 1.023890784982935,
"grad_norm": 1.4444975852966309,
"learning_rate": 0.00071952,
"loss": 1.3071,
"step": 9000
},
{
"epoch": 1.1376564277588168,
"grad_norm": 1.3250641822814941,
"learning_rate": 0.00079952,
"loss": 1.2717,
"step": 10000
},
{
"epoch": 1.1376564277588168,
"eval_accuracy": 0.677084,
"eval_loss": 1.288106918334961,
"eval_runtime": 11.2915,
"eval_samples_per_second": 22140.537,
"eval_steps_per_second": 43.307,
"step": 10000
},
{
"epoch": 1.2514220705346986,
"grad_norm": 1.4477468729019165,
"learning_rate": 0.0007996786565611985,
"loss": 1.2557,
"step": 11000
},
{
"epoch": 1.36518771331058,
"grad_norm": 1.1922030448913574,
"learning_rate": 0.0007987086748436788,
"loss": 1.2356,
"step": 12000
},
{
"epoch": 1.4789533560864618,
"grad_norm": 1.2596089839935303,
"learning_rate": 0.0007970896788508052,
"loss": 1.2048,
"step": 13000
},
{
"epoch": 1.5927189988623436,
"grad_norm": 1.0349920988082886,
"learning_rate": 0.0007948275336376884,
"loss": 1.1905,
"step": 14000
},
{
"epoch": 1.7064846416382253,
"grad_norm": 1.2018927335739136,
"learning_rate": 0.0007919213896323948,
"loss": 1.1742,
"step": 15000
},
{
"epoch": 1.7064846416382253,
"eval_accuracy": 0.705248,
"eval_loss": 1.166063904762268,
"eval_runtime": 10.2238,
"eval_samples_per_second": 24452.808,
"eval_steps_per_second": 47.83,
"step": 15000
},
{
"epoch": 1.820250284414107,
"grad_norm": 1.0107839107513428,
"learning_rate": 0.0007883779147866073,
"loss": 1.1634,
"step": 16000
},
{
"epoch": 1.9340159271899886,
"grad_norm": 1.0561189651489258,
"learning_rate": 0.0007842028713931261,
"loss": 1.1485,
"step": 17000
},
{
"epoch": 2.04778156996587,
"grad_norm": 1.2642914056777954,
"learning_rate": 0.0007794030487826318,
"loss": 1.1177,
"step": 18000
},
{
"epoch": 2.161547212741752,
"grad_norm": 1.041416049003601,
"learning_rate": 0.0007739862522830791,
"loss": 1.0864,
"step": 19000
},
{
"epoch": 2.2753128555176336,
"grad_norm": 1.0880264043807983,
"learning_rate": 0.0007679612905269062,
"loss": 1.0846,
"step": 20000
},
{
"epoch": 2.2753128555176336,
"eval_accuracy": 0.717844,
"eval_loss": 1.1148858070373535,
"eval_runtime": 10.0405,
"eval_samples_per_second": 24899.239,
"eval_steps_per_second": 48.703,
"step": 20000
},
{
"epoch": 2.3890784982935154,
"grad_norm": 1.2247668504714966,
"learning_rate": 0.0007613448798360993,
"loss": 1.0832,
"step": 21000
},
{
"epoch": 2.502844141069397,
"grad_norm": 0.939128577709198,
"learning_rate": 0.0007541345353494786,
"loss": 1.0718,
"step": 22000
},
{
"epoch": 2.616609783845279,
"grad_norm": 0.8472806811332703,
"learning_rate": 0.0007463563776182788,
"loss": 1.0741,
"step": 23000
},
{
"epoch": 2.73037542662116,
"grad_norm": 0.9360683560371399,
"learning_rate": 0.000738007485475254,
"loss": 1.066,
"step": 24000
},
{
"epoch": 2.8441410693970424,
"grad_norm": 0.8565033674240112,
"learning_rate": 0.0007291456059015493,
"loss": 1.0619,
"step": 25000
},
{
"epoch": 2.8441410693970424,
"eval_accuracy": 0.726136,
"eval_loss": 1.077797293663025,
"eval_runtime": 10.346,
"eval_samples_per_second": 24163.966,
"eval_steps_per_second": 47.265,
"step": 25000
},
{
"epoch": 2.9579067121729237,
"grad_norm": 1.0906000137329102,
"learning_rate": 0.0007197139797510538,
"loss": 1.055,
"step": 26000
},
{
"epoch": 3.0716723549488054,
"grad_norm": 0.9203127026557922,
"learning_rate": 0.0007097624442901132,
"loss": 1.0186,
"step": 27000
},
{
"epoch": 3.185437997724687,
"grad_norm": 0.9751584529876709,
"learning_rate": 0.0006993071824080197,
"loss": 1.0015,
"step": 28000
},
{
"epoch": 3.299203640500569,
"grad_norm": 0.9114183187484741,
"learning_rate": 0.0006883651961389032,
"loss": 1.0015,
"step": 29000
},
{
"epoch": 3.4129692832764507,
"grad_norm": 0.8756843209266663,
"learning_rate": 0.0006769542790135331,
"loss": 1.0029,
"step": 30000
},
{
"epoch": 3.4129692832764507,
"eval_accuracy": 0.732152,
"eval_loss": 1.0556296110153198,
"eval_runtime": 10.3382,
"eval_samples_per_second": 24182.19,
"eval_steps_per_second": 47.3,
"step": 30000
},
{
"epoch": 3.526734926052332,
"grad_norm": 0.9515267014503479,
"learning_rate": 0.0006650929871240102,
"loss": 1.0071,
"step": 31000
},
{
"epoch": 3.640500568828214,
"grad_norm": 0.8560661673545837,
"learning_rate": 0.0006528131100577897,
"loss": 1.0052,
"step": 32000
},
{
"epoch": 3.7542662116040955,
"grad_norm": 0.7972965836524963,
"learning_rate": 0.0006401100359805646,
"loss": 0.9941,
"step": 33000
},
{
"epoch": 3.868031854379977,
"grad_norm": 0.8569052219390869,
"learning_rate": 0.0006270165021451055,
"loss": 0.9958,
"step": 34000
},
{
"epoch": 3.981797497155859,
"grad_norm": 0.8554688096046448,
"learning_rate": 0.0006135538008644762,
"loss": 0.9936,
"step": 35000
},
{
"epoch": 3.981797497155859,
"eval_accuracy": 0.737548,
"eval_loss": 1.0317354202270508,
"eval_runtime": 10.3557,
"eval_samples_per_second": 24141.212,
"eval_steps_per_second": 47.22,
"step": 35000
},
{
"epoch": 4.09556313993174,
"grad_norm": 0.8586387634277344,
"learning_rate": 0.0005997438247807972,
"loss": 0.9476,
"step": 36000
},
{
"epoch": 4.2093287827076225,
"grad_norm": 0.8404794335365295,
"learning_rate": 0.0005856233206956809,
"loss": 0.9405,
"step": 37000
},
{
"epoch": 4.323094425483504,
"grad_norm": 0.8823213577270508,
"learning_rate": 0.0005711869855083862,
"loss": 0.9397,
"step": 38000
},
{
"epoch": 4.436860068259386,
"grad_norm": 0.8017415404319763,
"learning_rate": 0.0005565019625838785,
"loss": 0.9469,
"step": 39000
},
{
"epoch": 4.550625711035267,
"grad_norm": 0.8456715941429138,
"learning_rate": 0.0005415332824531774,
"loss": 0.9429,
"step": 40000
},
{
"epoch": 4.550625711035267,
"eval_accuracy": 0.742428,
"eval_loss": 1.0149633884429932,
"eval_runtime": 10.3228,
"eval_samples_per_second": 24218.315,
"eval_steps_per_second": 47.371,
"step": 40000
},
{
"epoch": 4.664391353811149,
"grad_norm": 0.920251190662384,
"learning_rate": 0.0005263344451321572,
"loss": 0.9433,
"step": 41000
},
{
"epoch": 4.778156996587031,
"grad_norm": 0.7446300983428955,
"learning_rate": 0.000510930166515435,
"loss": 0.9433,
"step": 42000
},
{
"epoch": 4.891922639362912,
"grad_norm": 0.9206159114837646,
"learning_rate": 0.0004953454965801175,
"loss": 0.9442,
"step": 43000
},
{
"epoch": 5.005688282138794,
"grad_norm": 0.9331917762756348,
"learning_rate": 0.00047960577865027823,
"loss": 0.9357,
"step": 44000
},
{
"epoch": 5.1194539249146755,
"grad_norm": 0.8155319094657898,
"learning_rate": 0.0004637366081844012,
"loss": 0.8818,
"step": 45000
},
{
"epoch": 5.1194539249146755,
"eval_accuracy": 0.74508,
"eval_loss": 1.0118529796600342,
"eval_runtime": 10.3937,
"eval_samples_per_second": 24053.141,
"eval_steps_per_second": 47.048,
"step": 45000
},
{
"epoch": 5.233219567690558,
"grad_norm": 0.9535221457481384,
"learning_rate": 0.0004477637911528123,
"loss": 0.884,
"step": 46000
},
{
"epoch": 5.346985210466439,
"grad_norm": 0.9470248222351074,
"learning_rate": 0.0004317293826802243,
"loss": 0.8892,
"step": 47000
},
{
"epoch": 5.460750853242321,
"grad_norm": 0.87973952293396,
"learning_rate": 0.0004156273608793912,
"loss": 0.8903,
"step": 48000
},
{
"epoch": 5.5745164960182025,
"grad_norm": 0.8131686449050903,
"learning_rate": 0.0003995160577323998,
"loss": 0.8871,
"step": 49000
},
{
"epoch": 5.688282138794084,
"grad_norm": 0.8810710906982422,
"learning_rate": 0.0003833894224734173,
"loss": 0.8868,
"step": 50000
},
{
"epoch": 5.688282138794084,
"eval_accuracy": 0.74856,
"eval_loss": 0.9947025179862976,
"eval_runtime": 10.546,
"eval_samples_per_second": 23705.725,
"eval_steps_per_second": 46.368,
"step": 50000
},
{
"epoch": 5.802047781569966,
"grad_norm": 0.9855514764785767,
"learning_rate": 0.0003673058762504636,
"loss": 0.8876,
"step": 51000
},
{
"epoch": 5.915813424345847,
"grad_norm": 0.9044457674026489,
"learning_rate": 0.0003512593787221045,
"loss": 0.8879,
"step": 52000
},
{
"epoch": 6.0295790671217295,
"grad_norm": 0.7832645773887634,
"learning_rate": 0.0003353239798219901,
"loss": 0.8673,
"step": 53000
},
{
"epoch": 6.143344709897611,
"grad_norm": 0.8719345331192017,
"learning_rate": 0.0003194617325587946,
"loss": 0.8263,
"step": 54000
},
{
"epoch": 6.257110352673493,
"grad_norm": 0.8254925608634949,
"learning_rate": 0.0003037304542170158,
"loss": 0.8323,
"step": 55000
},
{
"epoch": 6.257110352673493,
"eval_accuracy": 0.74912,
"eval_loss": 1.0007187128067017,
"eval_runtime": 11.7739,
"eval_samples_per_second": 21233.418,
"eval_steps_per_second": 41.533,
"step": 55000
},
{
"epoch": 6.370875995449374,
"grad_norm": 0.9038862586021423,
"learning_rate": 0.00028815572653093183,
"loss": 0.8324,
"step": 56000
},
{
"epoch": 6.484641638225256,
"grad_norm": 0.9034783244132996,
"learning_rate": 0.0002727781703007723,
"loss": 0.8346,
"step": 57000
},
{
"epoch": 6.598407281001138,
"grad_norm": 0.9226493239402771,
"learning_rate": 0.0002576372347370359,
"loss": 0.8362,
"step": 58000
},
{
"epoch": 6.712172923777019,
"grad_norm": 0.964350163936615,
"learning_rate": 0.0002426819230705446,
"loss": 0.8375,
"step": 59000
},
{
"epoch": 6.825938566552901,
"grad_norm": 0.9121440052986145,
"learning_rate": 0.00022798243735498786,
"loss": 0.838,
"step": 60000
},
{
"epoch": 6.825938566552901,
"eval_accuracy": 0.752208,
"eval_loss": 0.985443651676178,
"eval_runtime": 11.2968,
"eval_samples_per_second": 22130.153,
"eval_steps_per_second": 43.287,
"step": 60000
},
{
"epoch": 6.939704209328783,
"grad_norm": 0.9455272555351257,
"learning_rate": 0.00021356268145433904,
"loss": 0.83,
"step": 61000
},
{
"epoch": 7.053469852104665,
"grad_norm": 0.888077437877655,
"learning_rate": 0.0001994461043443134,
"loss": 0.8038,
"step": 62000
},
{
"epoch": 7.167235494880546,
"grad_norm": 0.9904555678367615,
"learning_rate": 0.00018565566198034617,
"loss": 0.7762,
"step": 63000
},
{
"epoch": 7.281001137656427,
"grad_norm": 0.9325763583183289,
"learning_rate": 0.00017221377996730371,
"loss": 0.7832,
"step": 64000
},
{
"epoch": 7.39476678043231,
"grad_norm": 1.1246719360351562,
"learning_rate": 0.00015915519640338763,
"loss": 0.7835,
"step": 65000
},
{
"epoch": 7.39476678043231,
"eval_accuracy": 0.752124,
"eval_loss": 0.9989385604858398,
"eval_runtime": 11.8501,
"eval_samples_per_second": 21096.838,
"eval_steps_per_second": 41.265,
"step": 65000
},
{
"epoch": 7.508532423208191,
"grad_norm": 0.7936950922012329,
"learning_rate": 0.0001464750070030455,
"loss": 0.785,
"step": 66000
},
{
"epoch": 7.622298065984073,
"grad_norm": 1.1911970376968384,
"learning_rate": 0.0001342191476487465,
"loss": 0.7855,
"step": 67000
},
{
"epoch": 7.736063708759954,
"grad_norm": 1.1150842905044556,
"learning_rate": 0.00012238301552628276,
"loss": 0.7876,
"step": 68000
},
{
"epoch": 7.849829351535837,
"grad_norm": 0.8056913018226624,
"learning_rate": 0.00011100948879440256,
"loss": 0.7854,
"step": 69000
},
{
"epoch": 7.963594994311718,
"grad_norm": 0.9775083065032959,
"learning_rate": 0.00010009429600800158,
"loss": 0.7836,
"step": 70000
},
{
"epoch": 7.963594994311718,
"eval_accuracy": 0.753456,
"eval_loss": 0.9900269508361816,
"eval_runtime": 11.6532,
"eval_samples_per_second": 21453.25,
"eval_steps_per_second": 41.963,
"step": 70000
},
{
"epoch": 8.0773606370876,
"grad_norm": 0.8832221627235413,
"learning_rate": 8.966680090652002e-05,
"loss": 0.7513,
"step": 71000
},
{
"epoch": 8.19112627986348,
"grad_norm": 1.0215216875076294,
"learning_rate": 7.97439603705802e-05,
"loss": 0.7438,
"step": 72000
},
{
"epoch": 8.304891922639364,
"grad_norm": 0.9041171669960022,
"learning_rate": 7.035104738078215e-05,
"loss": 0.7425,
"step": 73000
},
{
"epoch": 8.418657565415245,
"grad_norm": 0.9626539349555969,
"learning_rate": 6.148453433191126e-05,
"loss": 0.7463,
"step": 74000
},
{
"epoch": 8.532423208191126,
"grad_norm": 1.0108016729354858,
"learning_rate": 5.317654149350526e-05,
"loss": 0.7451,
"step": 75000
},
{
"epoch": 8.532423208191126,
"eval_accuracy": 0.752852,
"eval_loss": 1.00435471534729,
"eval_runtime": 11.8355,
"eval_samples_per_second": 21122.845,
"eval_steps_per_second": 41.316,
"step": 75000
},
{
"epoch": 8.646188850967008,
"grad_norm": 0.9667897820472717,
"learning_rate": 4.5423948713286365e-05,
"loss": 0.7421,
"step": 76000
},
{
"epoch": 8.759954493742889,
"grad_norm": 0.8784095644950867,
"learning_rate": 3.825484091802838e-05,
"loss": 0.7486,
"step": 77000
},
{
"epoch": 8.873720136518772,
"grad_norm": 0.9861534833908081,
"learning_rate": 3.166652567546153e-05,
"loss": 0.7426,
"step": 78000
},
{
"epoch": 8.987485779294653,
"grad_norm": 0.9912355542182922,
"learning_rate": 2.5682870142857394e-05,
"loss": 0.7428,
"step": 79000
},
{
"epoch": 9.101251422070535,
"grad_norm": 0.8502938747406006,
"learning_rate": 2.0301627096753005e-05,
"loss": 0.7207,
"step": 80000
},
{
"epoch": 9.101251422070535,
"eval_accuracy": 0.753112,
"eval_loss": 1.0053608417510986,
"eval_runtime": 12.237,
"eval_samples_per_second": 20429.87,
"eval_steps_per_second": 39.961,
"step": 80000
},
{
"epoch": 9.215017064846416,
"grad_norm": 0.8870697617530823,
"learning_rate": 1.5537838186957887e-05,
"loss": 0.7224,
"step": 81000
},
{
"epoch": 9.328782707622299,
"grad_norm": 0.9085851907730103,
"learning_rate": 1.1399250144500695e-05,
"loss": 0.7236,
"step": 82000
},
{
"epoch": 9.44254835039818,
"grad_norm": 0.9153009057044983,
"learning_rate": 7.895782041054834e-06,
"loss": 0.7208,
"step": 83000
},
{
"epoch": 9.556313993174061,
"grad_norm": 0.9525237679481506,
"learning_rate": 5.026118113090661e-06,
"loss": 0.7208,
"step": 84000
},
{
"epoch": 9.670079635949943,
"grad_norm": 0.9772939682006836,
"learning_rate": 2.8006541473553527e-06,
"loss": 0.721,
"step": 85000
},
{
"epoch": 9.670079635949943,
"eval_accuracy": 0.75292,
"eval_loss": 1.0081429481506348,
"eval_runtime": 12.3975,
"eval_samples_per_second": 20165.284,
"eval_steps_per_second": 39.443,
"step": 85000
},
{
"epoch": 9.783845278725824,
"grad_norm": 0.9219892024993896,
"learning_rate": 1.2198128698185597e-06,
"loss": 0.7207,
"step": 86000
},
{
"epoch": 9.897610921501707,
"grad_norm": 0.8879119157791138,
"learning_rate": 2.8555010796385004e-07,
"loss": 0.7195,
"step": 87000
},
{
"epoch": 10.0,
"step": 87900,
"total_flos": 9.6637212e+17,
"train_loss": 1.0272872150581716,
"train_runtime": 4575.9892,
"train_samples_per_second": 9833.939,
"train_steps_per_second": 19.209
}
],
"logging_steps": 1000,
"max_steps": 87900,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5000,
"total_flos": 9.6637212e+17,
"train_batch_size": 512,
"trial_name": null,
"trial_params": null
}