{
  "best_metric": 1.4878435134887695,
  "best_model_checkpoint": "lora_lr_pad/mistralai/Mistral-7B-Instruct-v0.2/unaligned/checkpoint-500",
  "epoch": 0.655150351887396,
  "eval_steps": 20,
  "global_step": 512,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012795905310300703,
      "grad_norm": 1.3359375,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 3.2562,
      "step": 1
    },
    {
      "epoch": 0.0025591810620601407,
      "grad_norm": 1.3671875,
      "learning_rate": 4.000000000000001e-06,
      "loss": 3.152,
      "step": 2
    },
    {
      "epoch": 0.003838771593090211,
      "grad_norm": 1.2734375,
      "learning_rate": 6e-06,
      "loss": 3.101,
      "step": 3
    },
    {
      "epoch": 0.005118362124120281,
      "grad_norm": 1.421875,
      "learning_rate": 8.000000000000001e-06,
      "loss": 3.2665,
      "step": 4
    },
    {
      "epoch": 0.006397952655150352,
      "grad_norm": 1.375,
      "learning_rate": 1e-05,
      "loss": 3.2401,
      "step": 5
    },
    {
      "epoch": 0.007677543186180422,
      "grad_norm": 1.3046875,
      "learning_rate": 1.2e-05,
      "loss": 3.1574,
      "step": 6
    },
    {
      "epoch": 0.008957133717210493,
      "grad_norm": 1.3515625,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 3.1197,
      "step": 7
    },
    {
      "epoch": 0.010236724248240563,
      "grad_norm": 1.4140625,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 3.2179,
      "step": 8
    },
    {
      "epoch": 0.011516314779270634,
      "grad_norm": 1.421875,
      "learning_rate": 1.8e-05,
      "loss": 3.2687,
      "step": 9
    },
    {
      "epoch": 0.012795905310300703,
      "grad_norm": 1.4609375,
      "learning_rate": 2e-05,
      "loss": 3.2973,
      "step": 10
    },
    {
      "epoch": 0.014075495841330775,
      "grad_norm": 1.3359375,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 3.1228,
      "step": 11
    },
    {
      "epoch": 0.015355086372360844,
      "grad_norm": 1.234375,
      "learning_rate": 2.4e-05,
      "loss": 3.0316,
      "step": 12
    },
    {
      "epoch": 0.016634676903390915,
      "grad_norm": 1.4765625,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 3.2293,
      "step": 13
    },
    {
      "epoch": 0.017914267434420986,
      "grad_norm": 1.4375,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 3.2405,
      "step": 14
    },
    {
      "epoch": 0.019193857965451054,
      "grad_norm": 1.453125,
      "learning_rate": 3e-05,
      "loss": 3.217,
      "step": 15
    },
    {
      "epoch": 0.020473448496481125,
      "grad_norm": 1.375,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 3.092,
      "step": 16
    },
    {
      "epoch": 0.021753039027511197,
      "grad_norm": 1.59375,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 3.2057,
      "step": 17
    },
    {
      "epoch": 0.023032629558541268,
      "grad_norm": 1.3984375,
      "learning_rate": 3.6e-05,
      "loss": 2.9932,
      "step": 18
    },
    {
      "epoch": 0.02431222008957134,
      "grad_norm": 1.546875,
      "learning_rate": 3.8e-05,
      "loss": 3.0673,
      "step": 19
    },
    {
      "epoch": 0.025591810620601407,
      "grad_norm": 1.5703125,
      "learning_rate": 4e-05,
      "loss": 2.9652,
      "step": 20
    },
    {
      "epoch": 0.025591810620601407,
      "eval_loss": 2.928436756134033,
      "eval_runtime": 103.8047,
      "eval_samples_per_second": 48.167,
      "eval_steps_per_second": 1.512,
      "step": 20
    },
    {
      "epoch": 0.026871401151631478,
      "grad_norm": 1.6875,
      "learning_rate": 4.2e-05,
      "loss": 2.9173,
      "step": 21
    },
    {
      "epoch": 0.02815099168266155,
      "grad_norm": 1.7890625,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 2.9455,
      "step": 22
    },
    {
      "epoch": 0.02943058221369162,
      "grad_norm": 1.7421875,
      "learning_rate": 4.600000000000001e-05,
      "loss": 2.909,
      "step": 23
    },
    {
      "epoch": 0.030710172744721688,
      "grad_norm": 1.7734375,
      "learning_rate": 4.8e-05,
      "loss": 2.8191,
      "step": 24
    },
    {
      "epoch": 0.03198976327575176,
      "grad_norm": 1.8515625,
      "learning_rate": 5e-05,
      "loss": 2.7853,
      "step": 25
    },
    {
      "epoch": 0.03326935380678183,
      "grad_norm": 1.7890625,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 2.7678,
      "step": 26
    },
    {
      "epoch": 0.0345489443378119,
      "grad_norm": 1.7890625,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 2.6028,
      "step": 27
    },
    {
      "epoch": 0.03582853486884197,
      "grad_norm": 1.8046875,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 2.6369,
      "step": 28
    },
    {
      "epoch": 0.037108125399872044,
      "grad_norm": 1.8984375,
      "learning_rate": 5.8e-05,
      "loss": 2.6158,
      "step": 29
    },
    {
      "epoch": 0.03838771593090211,
      "grad_norm": 1.59375,
      "learning_rate": 6e-05,
      "loss": 2.2824,
      "step": 30
    },
    {
      "epoch": 0.03966730646193218,
      "grad_norm": 1.6796875,
      "learning_rate": 6.2e-05,
      "loss": 2.4286,
      "step": 31
    },
    {
      "epoch": 0.04094689699296225,
      "grad_norm": 1.5625,
      "learning_rate": 6.400000000000001e-05,
      "loss": 2.2919,
      "step": 32
    },
    {
      "epoch": 0.04222648752399232,
      "grad_norm": 1.546875,
      "learning_rate": 6.6e-05,
      "loss": 2.2725,
      "step": 33
    },
    {
      "epoch": 0.04350607805502239,
      "grad_norm": 1.484375,
      "learning_rate": 6.800000000000001e-05,
      "loss": 2.202,
      "step": 34
    },
    {
      "epoch": 0.044785668586052464,
      "grad_norm": 1.375,
      "learning_rate": 7e-05,
      "loss": 2.1415,
      "step": 35
    },
    {
      "epoch": 0.046065259117082535,
      "grad_norm": 1.328125,
      "learning_rate": 7.2e-05,
      "loss": 2.0692,
      "step": 36
    },
    {
      "epoch": 0.04734484964811261,
      "grad_norm": 1.2734375,
      "learning_rate": 7.4e-05,
      "loss": 2.1186,
      "step": 37
    },
    {
      "epoch": 0.04862444017914268,
      "grad_norm": 1.046875,
      "learning_rate": 7.6e-05,
      "loss": 1.9482,
      "step": 38
    },
    {
      "epoch": 0.04990403071017274,
      "grad_norm": 0.88671875,
      "learning_rate": 7.800000000000001e-05,
      "loss": 1.8985,
      "step": 39
    },
    {
      "epoch": 0.05118362124120281,
      "grad_norm": 0.7421875,
      "learning_rate": 8e-05,
      "loss": 1.844,
      "step": 40
    },
    {
      "epoch": 0.05118362124120281,
      "eval_loss": 1.8063277006149292,
      "eval_runtime": 103.8793,
      "eval_samples_per_second": 48.133,
      "eval_steps_per_second": 1.511,
      "step": 40
    },
    {
      "epoch": 0.052463211772232884,
      "grad_norm": 0.6015625,
      "learning_rate": 8.2e-05,
      "loss": 1.8771,
      "step": 41
    },
    {
      "epoch": 0.053742802303262956,
      "grad_norm": 0.515625,
      "learning_rate": 8.4e-05,
      "loss": 1.7902,
      "step": 42
    },
    {
      "epoch": 0.05502239283429303,
      "grad_norm": 0.45703125,
      "learning_rate": 8.6e-05,
      "loss": 1.7946,
      "step": 43
    },
    {
      "epoch": 0.0563019833653231,
      "grad_norm": 0.458984375,
      "learning_rate": 8.800000000000001e-05,
      "loss": 1.751,
      "step": 44
    },
    {
      "epoch": 0.05758157389635317,
      "grad_norm": 0.52734375,
      "learning_rate": 9e-05,
      "loss": 1.7537,
      "step": 45
    },
    {
      "epoch": 0.05886116442738324,
      "grad_norm": 0.4765625,
      "learning_rate": 9.200000000000001e-05,
      "loss": 1.7619,
      "step": 46
    },
    {
      "epoch": 0.060140754958413305,
      "grad_norm": 0.5546875,
      "learning_rate": 9.4e-05,
      "loss": 1.7668,
      "step": 47
    },
    {
      "epoch": 0.061420345489443376,
      "grad_norm": 0.625,
      "learning_rate": 9.6e-05,
      "loss": 1.7556,
      "step": 48
    },
    {
      "epoch": 0.06269993602047345,
      "grad_norm": 0.75390625,
      "learning_rate": 9.8e-05,
      "loss": 1.7158,
      "step": 49
    },
    {
      "epoch": 0.06397952655150352,
      "grad_norm": 0.8828125,
      "learning_rate": 0.0001,
      "loss": 1.7301,
      "step": 50
    },
    {
      "epoch": 0.06525911708253358,
      "grad_norm": 0.9921875,
      "learning_rate": 0.00010200000000000001,
      "loss": 1.7979,
      "step": 51
    },
    {
      "epoch": 0.06653870761356366,
      "grad_norm": 0.84765625,
      "learning_rate": 0.00010400000000000001,
      "loss": 1.6356,
      "step": 52
    },
    {
      "epoch": 0.06781829814459372,
      "grad_norm": 0.494140625,
      "learning_rate": 0.00010600000000000002,
      "loss": 1.6094,
      "step": 53
    },
    {
      "epoch": 0.0690978886756238,
      "grad_norm": 0.48046875,
      "learning_rate": 0.00010800000000000001,
      "loss": 1.6467,
      "step": 54
    },
    {
      "epoch": 0.07037747920665387,
      "grad_norm": 0.44140625,
      "learning_rate": 0.00011000000000000002,
      "loss": 1.576,
      "step": 55
    },
    {
      "epoch": 0.07165706973768395,
      "grad_norm": 0.384765625,
      "learning_rate": 0.00011200000000000001,
      "loss": 1.6307,
      "step": 56
    },
    {
      "epoch": 0.07293666026871401,
      "grad_norm": 0.375,
      "learning_rate": 0.00011399999999999999,
      "loss": 1.6133,
      "step": 57
    },
    {
      "epoch": 0.07421625079974409,
      "grad_norm": 0.33984375,
      "learning_rate": 0.000116,
      "loss": 1.6575,
      "step": 58
    },
    {
      "epoch": 0.07549584133077415,
      "grad_norm": 0.310546875,
      "learning_rate": 0.000118,
      "loss": 1.5782,
      "step": 59
    },
    {
      "epoch": 0.07677543186180422,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00012,
      "loss": 1.6386,
      "step": 60
    },
    {
      "epoch": 0.07677543186180422,
      "eval_loss": 1.5920685529708862,
      "eval_runtime": 103.8813,
      "eval_samples_per_second": 48.132,
      "eval_steps_per_second": 1.511,
      "step": 60
    },
    {
      "epoch": 0.0780550223928343,
      "grad_norm": 0.2890625,
      "learning_rate": 0.000122,
      "loss": 1.5949,
      "step": 61
    },
    {
      "epoch": 0.07933461292386436,
      "grad_norm": 0.28125,
      "learning_rate": 0.000124,
      "loss": 1.6136,
      "step": 62
    },
    {
      "epoch": 0.08061420345489444,
      "grad_norm": 0.2890625,
      "learning_rate": 0.000126,
      "loss": 1.6135,
      "step": 63
    },
    {
      "epoch": 0.0818937939859245,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00012800000000000002,
      "loss": 1.5579,
      "step": 64
    },
    {
      "epoch": 0.08317338451695458,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00013000000000000002,
      "loss": 1.6174,
      "step": 65
    },
    {
      "epoch": 0.08445297504798464,
      "grad_norm": 0.28125,
      "learning_rate": 0.000132,
      "loss": 1.6687,
      "step": 66
    },
    {
      "epoch": 0.08573256557901472,
      "grad_norm": 0.30078125,
      "learning_rate": 0.000134,
      "loss": 1.604,
      "step": 67
    },
    {
      "epoch": 0.08701215611004479,
      "grad_norm": 0.3125,
      "learning_rate": 0.00013600000000000003,
      "loss": 1.5936,
      "step": 68
    },
    {
      "epoch": 0.08829174664107485,
      "grad_norm": 0.27734375,
      "learning_rate": 0.000138,
      "loss": 1.5744,
      "step": 69
    },
    {
      "epoch": 0.08957133717210493,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00014,
      "loss": 1.5875,
      "step": 70
    },
    {
      "epoch": 0.09085092770313499,
      "grad_norm": 0.283203125,
      "learning_rate": 0.000142,
      "loss": 1.5938,
      "step": 71
    },
    {
      "epoch": 0.09213051823416507,
      "grad_norm": 0.291015625,
      "learning_rate": 0.000144,
      "loss": 1.5795,
      "step": 72
    },
    {
      "epoch": 0.09341010876519514,
      "grad_norm": 0.2578125,
      "learning_rate": 0.000146,
      "loss": 1.5642,
      "step": 73
    },
    {
      "epoch": 0.09468969929622521,
      "grad_norm": 0.255859375,
      "learning_rate": 0.000148,
      "loss": 1.6276,
      "step": 74
    },
    {
      "epoch": 0.09596928982725528,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.6222,
      "step": 75
    },
    {
      "epoch": 0.09724888035828536,
      "grad_norm": 0.232421875,
      "learning_rate": 0.000152,
      "loss": 1.5487,
      "step": 76
    },
    {
      "epoch": 0.09852847088931542,
      "grad_norm": 0.259765625,
      "learning_rate": 0.000154,
      "loss": 1.6183,
      "step": 77
    },
    {
      "epoch": 0.09980806142034548,
      "grad_norm": 0.228515625,
      "learning_rate": 0.00015600000000000002,
      "loss": 1.5813,
      "step": 78
    },
    {
      "epoch": 0.10108765195137556,
      "grad_norm": 0.25,
      "learning_rate": 0.00015800000000000002,
      "loss": 1.5332,
      "step": 79
    },
    {
      "epoch": 0.10236724248240563,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00016,
      "loss": 1.5553,
      "step": 80
    },
    {
      "epoch": 0.10236724248240563,
      "eval_loss": 1.5541130304336548,
      "eval_runtime": 103.8472,
      "eval_samples_per_second": 48.148,
      "eval_steps_per_second": 1.512,
      "step": 80
    },
    {
      "epoch": 0.1036468330134357,
      "grad_norm": 0.248046875,
      "learning_rate": 0.000162,
      "loss": 1.5697,
      "step": 81
    },
    {
      "epoch": 0.10492642354446577,
      "grad_norm": 0.251953125,
      "learning_rate": 0.000164,
      "loss": 1.5474,
      "step": 82
    },
    {
      "epoch": 0.10620601407549585,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.000166,
      "loss": 1.5834,
      "step": 83
    },
    {
      "epoch": 0.10748560460652591,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.000168,
      "loss": 1.542,
      "step": 84
    },
    {
      "epoch": 0.10876519513755598,
      "grad_norm": 0.2265625,
      "learning_rate": 0.00017,
      "loss": 1.5841,
      "step": 85
    },
    {
      "epoch": 0.11004478566858605,
      "grad_norm": 0.25390625,
      "learning_rate": 0.000172,
      "loss": 1.515,
      "step": 86
    },
    {
      "epoch": 0.11132437619961612,
      "grad_norm": 0.244140625,
      "learning_rate": 0.000174,
      "loss": 1.5565,
      "step": 87
    },
    {
      "epoch": 0.1126039667306462,
      "grad_norm": 0.2197265625,
      "learning_rate": 0.00017600000000000002,
      "loss": 1.6133,
      "step": 88
    },
    {
      "epoch": 0.11388355726167626,
      "grad_norm": 0.25,
      "learning_rate": 0.00017800000000000002,
      "loss": 1.5476,
      "step": 89
    },
    {
      "epoch": 0.11516314779270634,
      "grad_norm": 0.25,
      "learning_rate": 0.00018,
      "loss": 1.5212,
      "step": 90
    },
    {
      "epoch": 0.1164427383237364,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.000182,
      "loss": 1.5628,
      "step": 91
    },
    {
      "epoch": 0.11772232885476648,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00018400000000000003,
      "loss": 1.4946,
      "step": 92
    },
    {
      "epoch": 0.11900191938579655,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.00018600000000000002,
      "loss": 1.6303,
      "step": 93
    },
    {
      "epoch": 0.12028150991682661,
      "grad_norm": 0.24609375,
      "learning_rate": 0.000188,
      "loss": 1.5155,
      "step": 94
    },
    {
      "epoch": 0.12156110044785669,
      "grad_norm": 0.220703125,
      "learning_rate": 0.00019,
      "loss": 1.5652,
      "step": 95
    },
    {
      "epoch": 0.12284069097888675,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.000192,
      "loss": 1.6042,
      "step": 96
    },
    {
      "epoch": 0.12412028150991683,
      "grad_norm": 0.263671875,
      "learning_rate": 0.000194,
      "loss": 1.5697,
      "step": 97
    },
    {
      "epoch": 0.1253998720409469,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.000196,
      "loss": 1.4785,
      "step": 98
    },
    {
      "epoch": 0.12667946257197696,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00019800000000000002,
      "loss": 1.4921,
      "step": 99
    },
    {
      "epoch": 0.12795905310300704,
      "grad_norm": 0.22265625,
      "learning_rate": 0.0002,
      "loss": 1.4985,
      "step": 100
    },
    {
      "epoch": 0.12795905310300704,
      "eval_loss": 1.5341166257858276,
      "eval_runtime": 103.7833,
      "eval_samples_per_second": 48.177,
      "eval_steps_per_second": 1.513,
      "step": 100
    },
    {
      "epoch": 0.12923864363403711,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00019951456310679614,
      "loss": 1.494,
      "step": 101
    },
    {
      "epoch": 0.13051823416506717,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00019902912621359224,
      "loss": 1.5407,
      "step": 102
    },
    {
      "epoch": 0.13179782469609724,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00019854368932038837,
      "loss": 1.5755,
      "step": 103
    },
    {
      "epoch": 0.13307741522712732,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00019805825242718447,
      "loss": 1.5491,
      "step": 104
    },
    {
      "epoch": 0.1343570057581574,
      "grad_norm": 0.2421875,
      "learning_rate": 0.0001975728155339806,
      "loss": 1.5393,
      "step": 105
    },
    {
      "epoch": 0.13563659628918745,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.0001970873786407767,
      "loss": 1.5657,
      "step": 106
    },
    {
      "epoch": 0.13691618682021753,
      "grad_norm": 0.2119140625,
      "learning_rate": 0.00019660194174757283,
      "loss": 1.5551,
      "step": 107
    },
    {
      "epoch": 0.1381957773512476,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00019611650485436895,
      "loss": 1.5408,
      "step": 108
    },
    {
      "epoch": 0.13947536788227768,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00019563106796116505,
      "loss": 1.5375,
      "step": 109
    },
    {
      "epoch": 0.14075495841330773,
      "grad_norm": 0.216796875,
      "learning_rate": 0.00019514563106796118,
      "loss": 1.5578,
      "step": 110
    },
    {
      "epoch": 0.1420345489443378,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00019466019417475728,
      "loss": 1.4969,
      "step": 111
    },
    {
      "epoch": 0.1433141394753679,
      "grad_norm": 0.244140625,
      "learning_rate": 0.0001941747572815534,
      "loss": 1.5347,
      "step": 112
    },
    {
      "epoch": 0.14459373000639794,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00019368932038834954,
      "loss": 1.549,
      "step": 113
    },
    {
      "epoch": 0.14587332053742802,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00019320388349514564,
      "loss": 1.5749,
      "step": 114
    },
    {
      "epoch": 0.1471529110684581,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00019271844660194177,
      "loss": 1.5311,
      "step": 115
    },
    {
      "epoch": 0.14843250159948818,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.00019223300970873787,
      "loss": 1.5678,
      "step": 116
    },
    {
      "epoch": 0.14971209213051823,
      "grad_norm": 0.259765625,
      "learning_rate": 0.000191747572815534,
      "loss": 1.616,
      "step": 117
    },
    {
      "epoch": 0.1509916826615483,
      "grad_norm": 0.2158203125,
      "learning_rate": 0.0001912621359223301,
      "loss": 1.5449,
      "step": 118
    },
    {
      "epoch": 0.15227127319257838,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00019077669902912623,
      "loss": 1.4975,
      "step": 119
    },
    {
      "epoch": 0.15355086372360843,
      "grad_norm": 0.234375,
      "learning_rate": 0.00019029126213592236,
      "loss": 1.5631,
      "step": 120
    },
    {
      "epoch": 0.15355086372360843,
      "eval_loss": 1.5212680101394653,
      "eval_runtime": 103.8151,
      "eval_samples_per_second": 48.163,
      "eval_steps_per_second": 1.512,
      "step": 120
    },
    {
      "epoch": 0.1548304542546385,
      "grad_norm": 0.2177734375,
      "learning_rate": 0.00018980582524271846,
      "loss": 1.5534,
      "step": 121
    },
    {
      "epoch": 0.1561100447856686,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00018932038834951458,
      "loss": 1.5329,
      "step": 122
    },
    {
      "epoch": 0.15738963531669867,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.00018883495145631069,
      "loss": 1.5372,
      "step": 123
    },
    {
      "epoch": 0.15866922584772872,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00018834951456310681,
      "loss": 1.5373,
      "step": 124
    },
    {
      "epoch": 0.1599488163787588,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00018786407766990291,
      "loss": 1.5492,
      "step": 125
    },
    {
      "epoch": 0.16122840690978887,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00018737864077669904,
      "loss": 1.536,
      "step": 126
    },
    {
      "epoch": 0.16250799744081892,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00018689320388349517,
      "loss": 1.5719,
      "step": 127
    },
    {
      "epoch": 0.163787587971849,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00018640776699029127,
      "loss": 1.5101,
      "step": 128
    },
    {
      "epoch": 0.16506717850287908,
      "grad_norm": 0.22265625,
      "learning_rate": 0.0001859223300970874,
      "loss": 1.5896,
      "step": 129
    },
    {
      "epoch": 0.16634676903390916,
      "grad_norm": 0.220703125,
      "learning_rate": 0.0001854368932038835,
      "loss": 1.5791,
      "step": 130
    },
    {
      "epoch": 0.1676263595649392,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00018495145631067963,
      "loss": 1.5013,
      "step": 131
    },
    {
      "epoch": 0.1689059500959693,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00018446601941747576,
      "loss": 1.5421,
      "step": 132
    },
    {
      "epoch": 0.17018554062699937,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00018398058252427186,
      "loss": 1.5329,
      "step": 133
    },
    {
      "epoch": 0.17146513115802944,
      "grad_norm": 0.2294921875,
      "learning_rate": 0.00018349514563106799,
      "loss": 1.5631,
      "step": 134
    },
    {
      "epoch": 0.1727447216890595,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001830097087378641,
      "loss": 1.5278,
      "step": 135
    },
    {
      "epoch": 0.17402431222008957,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.00018252427184466022,
      "loss": 1.5548,
      "step": 136
    },
    {
      "epoch": 0.17530390275111965,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00018203883495145632,
      "loss": 1.5023,
      "step": 137
    },
    {
      "epoch": 0.1765834932821497,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.00018155339805825244,
      "loss": 1.5062,
      "step": 138
    },
    {
      "epoch": 0.17786308381317978,
      "grad_norm": 0.265625,
      "learning_rate": 0.00018106796116504857,
      "loss": 1.5637,
      "step": 139
    },
    {
      "epoch": 0.17914267434420986,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00018058252427184467,
      "loss": 1.5201,
      "step": 140
    },
    {
      "epoch": 0.17914267434420986,
      "eval_loss": 1.513644814491272,
      "eval_runtime": 103.8133,
      "eval_samples_per_second": 48.163,
      "eval_steps_per_second": 1.512,
      "step": 140
    },
    {
      "epoch": 0.18042226487523993,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.0001800970873786408,
      "loss": 1.5106,
      "step": 141
    },
    {
      "epoch": 0.18170185540626999,
      "grad_norm": 0.23046875,
      "learning_rate": 0.0001796116504854369,
      "loss": 1.5403,
      "step": 142
    },
    {
      "epoch": 0.18298144593730006,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00017912621359223303,
      "loss": 1.5373,
      "step": 143
    },
    {
      "epoch": 0.18426103646833014,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00017864077669902913,
      "loss": 1.5697,
      "step": 144
    },
    {
      "epoch": 0.1855406269993602,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00017815533980582526,
      "loss": 1.5135,
      "step": 145
    },
    {
      "epoch": 0.18682021753039027,
      "grad_norm": 0.228515625,
      "learning_rate": 0.0001776699029126214,
      "loss": 1.5446,
      "step": 146
    },
    {
      "epoch": 0.18809980806142035,
      "grad_norm": 0.22265625,
      "learning_rate": 0.0001771844660194175,
      "loss": 1.5226,
      "step": 147
    },
    {
      "epoch": 0.18937939859245043,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.00017669902912621362,
      "loss": 1.4343,
      "step": 148
    },
    {
      "epoch": 0.19065898912348048,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00017621359223300972,
      "loss": 1.5509,
      "step": 149
    },
    {
      "epoch": 0.19193857965451055,
      "grad_norm": 0.234375,
      "learning_rate": 0.00017572815533980585,
      "loss": 1.5397,
      "step": 150
    },
    {
      "epoch": 0.19321817018554063,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00017524271844660195,
      "loss": 1.4796,
      "step": 151
    },
    {
      "epoch": 0.1944977607165707,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00017475728155339805,
      "loss": 1.5855,
      "step": 152
    },
    {
      "epoch": 0.19577735124760076,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.00017427184466019418,
      "loss": 1.5221,
      "step": 153
    },
    {
      "epoch": 0.19705694177863084,
      "grad_norm": 0.265625,
      "learning_rate": 0.00017378640776699028,
      "loss": 1.4963,
      "step": 154
    },
    {
      "epoch": 0.19833653230966092,
      "grad_norm": 0.236328125,
      "learning_rate": 0.0001733009708737864,
      "loss": 1.5379,
      "step": 155
    },
    {
      "epoch": 0.19961612284069097,
      "grad_norm": 0.220703125,
      "learning_rate": 0.00017281553398058253,
      "loss": 1.4701,
      "step": 156
    },
    {
      "epoch": 0.20089571337172105,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.00017233009708737864,
      "loss": 1.5133,
      "step": 157
    },
    {
      "epoch": 0.20217530390275112,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00017184466019417476,
      "loss": 1.5238,
      "step": 158
    },
    {
      "epoch": 0.2034548944337812,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00017135922330097086,
      "loss": 1.5361,
      "step": 159
    },
    {
      "epoch": 0.20473448496481125,
      "grad_norm": 0.232421875,
      "learning_rate": 0.000170873786407767,
      "loss": 1.4585,
      "step": 160
    },
    {
      "epoch": 0.20473448496481125,
      "eval_loss": 1.5087493658065796,
      "eval_runtime": 103.8289,
      "eval_samples_per_second": 48.156,
      "eval_steps_per_second": 1.512,
      "step": 160
    },
    {
      "epoch": 0.20601407549584133,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0001703883495145631,
      "loss": 1.5574,
      "step": 161
    },
    {
      "epoch": 0.2072936660268714,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.00016990291262135922,
      "loss": 1.4938,
      "step": 162
    },
    {
      "epoch": 0.20857325655790146,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.00016941747572815535,
      "loss": 1.5307,
      "step": 163
    },
    {
      "epoch": 0.20985284708893154,
      "grad_norm": 0.23828125,
      "learning_rate": 0.00016893203883495145,
      "loss": 1.4849,
      "step": 164
    },
    {
      "epoch": 0.21113243761996162,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00016844660194174758,
      "loss": 1.4399,
      "step": 165
    },
    {
      "epoch": 0.2124120281509917,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00016796116504854368,
      "loss": 1.512,
      "step": 166
    },
    {
      "epoch": 0.21369161868202174,
      "grad_norm": 0.263671875,
      "learning_rate": 0.0001674757281553398,
      "loss": 1.5586,
      "step": 167
    },
    {
      "epoch": 0.21497120921305182,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00016699029126213594,
      "loss": 1.5673,
      "step": 168
    },
    {
      "epoch": 0.2162507997440819,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.00016650485436893204,
      "loss": 1.4893,
      "step": 169
    },
    {
      "epoch": 0.21753039027511195,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00016601941747572817,
      "loss": 1.5885,
      "step": 170
    },
    {
      "epoch": 0.21880998080614203,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00016553398058252427,
      "loss": 1.5318,
      "step": 171
    },
    {
      "epoch": 0.2200895713371721,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.0001650485436893204,
      "loss": 1.4523,
      "step": 172
    },
    {
      "epoch": 0.22136916186820219,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0001645631067961165,
      "loss": 1.5486,
      "step": 173
    },
    {
      "epoch": 0.22264875239923224,
      "grad_norm": 0.234375,
      "learning_rate": 0.00016407766990291262,
      "loss": 1.4989,
      "step": 174
    },
    {
      "epoch": 0.22392834293026231,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00016359223300970875,
      "loss": 1.5556,
      "step": 175
    },
    {
      "epoch": 0.2252079334612924,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00016310679611650485,
      "loss": 1.545,
      "step": 176
    },
    {
      "epoch": 0.22648752399232247,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00016262135922330098,
      "loss": 1.4939,
      "step": 177
    },
    {
      "epoch": 0.22776711452335252,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00016213592233009708,
      "loss": 1.4768,
      "step": 178
    },
    {
      "epoch": 0.2290467050543826,
      "grad_norm": 0.2578125,
      "learning_rate": 0.0001616504854368932,
      "loss": 1.494,
      "step": 179
    },
    {
      "epoch": 0.23032629558541268,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0001611650485436893,
      "loss": 1.5361,
      "step": 180
    },
    {
      "epoch": 0.23032629558541268,
      "eval_loss": 1.5049980878829956,
      "eval_runtime": 103.8247,
      "eval_samples_per_second": 48.158,
      "eval_steps_per_second": 1.512,
      "step": 180
    },
    {
      "epoch": 0.23160588611644273,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00016067961165048544,
      "loss": 1.5126,
      "step": 181
    },
    {
      "epoch": 0.2328854766474728,
      "grad_norm": 0.2216796875,
      "learning_rate": 0.00016019417475728157,
      "loss": 1.4835,
      "step": 182
    },
    {
      "epoch": 0.23416506717850288,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00015970873786407767,
      "loss": 1.5131,
      "step": 183
    },
    {
      "epoch": 0.23544465770953296,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0001592233009708738,
      "loss": 1.4804,
      "step": 184
    },
    {
      "epoch": 0.236724248240563,
      "grad_norm": 0.25,
      "learning_rate": 0.0001587378640776699,
      "loss": 1.6027,
      "step": 185
    },
    {
      "epoch": 0.2380038387715931,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00015825242718446603,
      "loss": 1.5373,
      "step": 186
    },
    {
      "epoch": 0.23928342930262317,
      "grad_norm": 0.2265625,
      "learning_rate": 0.00015776699029126213,
      "loss": 1.5531,
      "step": 187
    },
    {
      "epoch": 0.24056301983365322,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00015728155339805825,
      "loss": 1.5101,
      "step": 188
    },
    {
      "epoch": 0.2418426103646833,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00015679611650485438,
      "loss": 1.538,
      "step": 189
    },
    {
      "epoch": 0.24312220089571338,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00015631067961165048,
      "loss": 1.526,
      "step": 190
    },
    {
      "epoch": 0.24440179142674345,
      "grad_norm": 0.248046875,
      "learning_rate": 0.0001558252427184466,
      "loss": 1.5275,
      "step": 191
    },
    {
      "epoch": 0.2456813819577735,
      "grad_norm": 0.2421875,
      "learning_rate": 0.0001553398058252427,
      "loss": 1.567,
      "step": 192
    },
    {
      "epoch": 0.24696097248880358,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00015485436893203884,
      "loss": 1.4457,
      "step": 193
    },
    {
      "epoch": 0.24824056301983366,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00015436893203883497,
      "loss": 1.5728,
      "step": 194
    },
    {
      "epoch": 0.2495201535508637,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00015388349514563107,
      "loss": 1.4829,
      "step": 195
    },
    {
      "epoch": 0.2507997440818938,
      "grad_norm": 0.23046875,
      "learning_rate": 0.0001533980582524272,
      "loss": 1.5093,
      "step": 196
    },
    {
      "epoch": 0.25207933461292387,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0001529126213592233,
      "loss": 1.5079,
      "step": 197
    },
    {
      "epoch": 0.2533589251439539,
      "grad_norm": 0.2265625,
      "learning_rate": 0.00015242718446601943,
      "loss": 1.4996,
      "step": 198
    },
    {
      "epoch": 0.254638515674984,
      "grad_norm": 0.2255859375,
      "learning_rate": 0.00015194174757281553,
      "loss": 1.4967,
      "step": 199
    },
    {
      "epoch": 0.2559181062060141,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00015145631067961166,
      "loss": 1.485,
      "step": 200
    },
    {
      "epoch": 0.2559181062060141,
      "eval_loss": 1.5022693872451782,
      "eval_runtime": 103.8087,
      "eval_samples_per_second": 48.166,
      "eval_steps_per_second": 1.512,
      "step": 200
    },
    {
      "epoch": 0.2571976967370441,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00015097087378640778,
      "loss": 1.4926,
      "step": 201
    },
    {
      "epoch": 0.25847728726807423,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00015048543689320389,
      "loss": 1.5215,
      "step": 202
    },
    {
      "epoch": 0.2597568777991043,
      "grad_norm": 0.2373046875,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.5674,
      "step": 203
    },
    {
      "epoch": 0.26103646833013433,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00014951456310679611,
      "loss": 1.5157,
      "step": 204
    },
    {
      "epoch": 0.26231605886116444,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00014902912621359224,
      "loss": 1.4502,
      "step": 205
    },
    {
      "epoch": 0.2635956493921945,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00014854368932038834,
      "loss": 1.5289,
      "step": 206
    },
    {
      "epoch": 0.2648752399232246,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00014805825242718447,
      "loss": 1.4454,
      "step": 207
    },
    {
      "epoch": 0.26615483045425464,
      "grad_norm": 0.236328125,
      "learning_rate": 0.0001475728155339806,
      "loss": 1.5132,
      "step": 208
    },
    {
      "epoch": 0.2674344209852847,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0001470873786407767,
      "loss": 1.5041,
      "step": 209
    },
    {
      "epoch": 0.2687140115163148,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00014660194174757283,
      "loss": 1.5313,
      "step": 210
    },
    {
      "epoch": 0.26999360204734485,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00014611650485436893,
      "loss": 1.5156,
      "step": 211
    },
    {
      "epoch": 0.2712731925783749,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00014563106796116506,
      "loss": 1.4958,
      "step": 212
    },
    {
      "epoch": 0.272552783109405,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0001451456310679612,
      "loss": 1.5324,
      "step": 213
    },
    {
      "epoch": 0.27383237364043506,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0001446601941747573,
      "loss": 1.4894,
      "step": 214
    },
    {
      "epoch": 0.2751119641714651,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00014417475728155342,
      "loss": 1.4462,
      "step": 215
    },
    {
      "epoch": 0.2763915547024952,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00014368932038834952,
      "loss": 1.5,
      "step": 216
    },
    {
      "epoch": 0.27767114523352526,
      "grad_norm": 0.296875,
      "learning_rate": 0.00014320388349514565,
      "loss": 1.5317,
      "step": 217
    },
    {
      "epoch": 0.27895073576455537,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00014271844660194175,
      "loss": 1.5553,
      "step": 218
    },
    {
      "epoch": 0.2802303262955854,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00014223300970873787,
      "loss": 1.5055,
      "step": 219
    },
    {
      "epoch": 0.28150991682661547,
      "grad_norm": 0.25,
      "learning_rate": 0.000141747572815534,
      "loss": 1.5299,
      "step": 220
    },
    {
      "epoch": 0.28150991682661547,
      "eval_loss": 1.4998944997787476,
      "eval_runtime": 103.8007,
      "eval_samples_per_second": 48.169,
      "eval_steps_per_second": 1.513,
      "step": 220
    },
    {
      "epoch": 0.2827895073576456,
      "grad_norm": 0.2734375,
      "learning_rate": 0.0001412621359223301,
      "loss": 1.5298,
      "step": 221
    },
    {
      "epoch": 0.2840690978886756,
      "grad_norm": 0.25,
      "learning_rate": 0.00014077669902912623,
      "loss": 1.5178,
      "step": 222
    },
    {
      "epoch": 0.2853486884197057,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00014029126213592233,
      "loss": 1.4975,
      "step": 223
    },
    {
      "epoch": 0.2866282789507358,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.00013980582524271846,
      "loss": 1.5121,
      "step": 224
    },
    {
      "epoch": 0.28790786948176583,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00013932038834951456,
      "loss": 1.4838,
      "step": 225
    },
    {
      "epoch": 0.2891874600127959,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001388349514563107,
      "loss": 1.4422,
      "step": 226
    },
    {
      "epoch": 0.290467050543826,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00013834951456310682,
      "loss": 1.5315,
      "step": 227
    },
    {
      "epoch": 0.29174664107485604,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00013786407766990292,
      "loss": 1.524,
      "step": 228
    },
    {
      "epoch": 0.2930262316058861,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00013737864077669905,
      "loss": 1.4314,
      "step": 229
    },
    {
      "epoch": 0.2943058221369162,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00013689320388349515,
      "loss": 1.5496,
      "step": 230
    },
    {
      "epoch": 0.29558541266794625,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00013640776699029128,
      "loss": 1.5526,
      "step": 231
    },
    {
      "epoch": 0.29686500319897635,
      "grad_norm": 0.25,
      "learning_rate": 0.0001359223300970874,
      "loss": 1.427,
      "step": 232
    },
    {
      "epoch": 0.2981445937300064,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0001354368932038835,
      "loss": 1.4646,
      "step": 233
    },
    {
      "epoch": 0.29942418426103645,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00013495145631067963,
      "loss": 1.5392,
      "step": 234
    },
    {
      "epoch": 0.30070377479206656,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00013446601941747573,
      "loss": 1.5333,
      "step": 235
    },
    {
      "epoch": 0.3019833653230966,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.00013398058252427186,
      "loss": 1.5282,
      "step": 236
    },
    {
      "epoch": 0.30326295585412666,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00013349514563106796,
      "loss": 1.4763,
      "step": 237
    },
    {
      "epoch": 0.30454254638515676,
      "grad_norm": 0.2734375,
      "learning_rate": 0.0001330097087378641,
      "loss": 1.5199,
      "step": 238
    },
    {
      "epoch": 0.3058221369161868,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00013252427184466022,
      "loss": 1.5497,
      "step": 239
    },
    {
      "epoch": 0.30710172744721687,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00013203883495145632,
      "loss": 1.5539,
      "step": 240
    },
    {
      "epoch": 0.30710172744721687,
      "eval_loss": 1.4981228113174438,
      "eval_runtime": 103.819,
      "eval_samples_per_second": 48.161,
      "eval_steps_per_second": 1.512,
      "step": 240
    },
    {
      "epoch": 0.30838131797824697,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00013155339805825245,
      "loss": 1.5309,
      "step": 241
    },
    {
      "epoch": 0.309660908509277,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00013106796116504855,
      "loss": 1.4503,
      "step": 242
    },
    {
      "epoch": 0.31094049904030713,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00013058252427184468,
      "loss": 1.4718,
      "step": 243
    },
    {
      "epoch": 0.3122200895713372,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00013009708737864078,
      "loss": 1.4445,
      "step": 244
    },
    {
      "epoch": 0.31349968010236723,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.0001296116504854369,
      "loss": 1.5606,
      "step": 245
    },
    {
      "epoch": 0.31477927063339733,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00012912621359223304,
      "loss": 1.4917,
      "step": 246
    },
    {
      "epoch": 0.3160588611644274,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00012864077669902914,
      "loss": 1.5295,
      "step": 247
    },
    {
      "epoch": 0.31733845169545744,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00012815533980582526,
      "loss": 1.5685,
      "step": 248
    },
    {
      "epoch": 0.31861804222648754,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00012766990291262137,
      "loss": 1.4985,
      "step": 249
    },
    {
      "epoch": 0.3198976327575176,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0001271844660194175,
      "loss": 1.4746,
      "step": 250
    },
    {
      "epoch": 0.32117722328854764,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00012669902912621362,
      "loss": 1.5615,
      "step": 251
    },
    {
      "epoch": 0.32245681381957775,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00012621359223300972,
      "loss": 1.4777,
      "step": 252
    },
    {
      "epoch": 0.3237364043506078,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00012572815533980585,
      "loss": 1.4152,
      "step": 253
    },
    {
      "epoch": 0.32501599488163785,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00012524271844660195,
      "loss": 1.4632,
      "step": 254
    },
    {
      "epoch": 0.32629558541266795,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00012475728155339805,
      "loss": 1.5011,
      "step": 255
    },
    {
      "epoch": 0.327575175943698,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00012427184466019418,
      "loss": 1.5339,
      "step": 256
    },
    {
      "epoch": 0.3288547664747281,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00012378640776699028,
      "loss": 1.4776,
      "step": 257
    },
    {
      "epoch": 0.33013435700575816,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001233009708737864,
      "loss": 1.4585,
      "step": 258
    },
    {
      "epoch": 0.3314139475367882,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0001228155339805825,
      "loss": 1.4791,
      "step": 259
    },
    {
      "epoch": 0.3326935380678183,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00012233009708737864,
      "loss": 1.521,
      "step": 260
    },
    {
      "epoch": 0.3326935380678183,
      "eval_loss": 1.4961707592010498,
      "eval_runtime": 103.7907,
      "eval_samples_per_second": 48.174,
      "eval_steps_per_second": 1.513,
      "step": 260
    },
    {
      "epoch": 0.33397312859884837,
      "grad_norm": 0.265625,
      "learning_rate": 0.00012184466019417475,
      "loss": 1.5015,
      "step": 261
    },
    {
      "epoch": 0.3352527191298784,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00012135922330097087,
      "loss": 1.4732,
      "step": 262
    },
    {
      "epoch": 0.3365323096609085,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00012087378640776698,
      "loss": 1.4832,
      "step": 263
    },
    {
      "epoch": 0.3378119001919386,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0001203883495145631,
      "loss": 1.4629,
      "step": 264
    },
    {
      "epoch": 0.3390914907229686,
      "grad_norm": 0.265625,
      "learning_rate": 0.00011990291262135923,
      "loss": 1.5046,
      "step": 265
    },
    {
      "epoch": 0.34037108125399873,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00011941747572815534,
      "loss": 1.5724,
      "step": 266
    },
    {
      "epoch": 0.3416506717850288,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00011893203883495146,
      "loss": 1.4481,
      "step": 267
    },
    {
      "epoch": 0.3429302623160589,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00011844660194174757,
      "loss": 1.5081,
      "step": 268
    },
    {
      "epoch": 0.34420985284708894,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00011796116504854368,
      "loss": 1.5056,
      "step": 269
    },
    {
      "epoch": 0.345489443378119,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0001174757281553398,
      "loss": 1.5279,
      "step": 270
    },
    {
      "epoch": 0.3467690339091491,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00011699029126213593,
      "loss": 1.5293,
      "step": 271
    },
    {
      "epoch": 0.34804862444017914,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00011650485436893204,
      "loss": 1.5436,
      "step": 272
    },
    {
      "epoch": 0.3493282149712092,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00011601941747572816,
      "loss": 1.5806,
      "step": 273
    },
    {
      "epoch": 0.3506078055022393,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00011553398058252427,
      "loss": 1.5412,
      "step": 274
    },
    {
      "epoch": 0.35188739603326935,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00011504854368932039,
      "loss": 1.5225,
      "step": 275
    },
    {
      "epoch": 0.3531669865642994,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0001145631067961165,
      "loss": 1.6229,
      "step": 276
    },
    {
      "epoch": 0.3544465770953295,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00011407766990291261,
      "loss": 1.5231,
      "step": 277
    },
    {
      "epoch": 0.35572616762635956,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00011359223300970874,
      "loss": 1.481,
      "step": 278
    },
    {
      "epoch": 0.3570057581573896,
      "grad_norm": 0.265625,
      "learning_rate": 0.00011310679611650486,
      "loss": 1.544,
      "step": 279
    },
    {
      "epoch": 0.3582853486884197,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00011262135922330097,
      "loss": 1.5186,
      "step": 280
    },
    {
      "epoch": 0.3582853486884197,
      "eval_loss": 1.4947106838226318,
      "eval_runtime": 103.7757,
      "eval_samples_per_second": 48.181,
      "eval_steps_per_second": 1.513,
      "step": 280
    },
    {
      "epoch": 0.35956493921944976,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00011213592233009709,
      "loss": 1.5507,
      "step": 281
    },
    {
      "epoch": 0.36084452975047987,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0001116504854368932,
      "loss": 1.5091,
      "step": 282
    },
    {
      "epoch": 0.3621241202815099,
      "grad_norm": 0.28125,
      "learning_rate": 0.00011116504854368932,
      "loss": 1.4962,
      "step": 283
    },
    {
      "epoch": 0.36340371081253997,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00011067961165048544,
      "loss": 1.537,
      "step": 284
    },
    {
      "epoch": 0.3646833013435701,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00011019417475728156,
      "loss": 1.5814,
      "step": 285
    },
    {
      "epoch": 0.3659628918746001,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00010970873786407767,
      "loss": 1.5328,
      "step": 286
    },
    {
      "epoch": 0.3672424824056302,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00010922330097087379,
      "loss": 1.5429,
      "step": 287
    },
    {
      "epoch": 0.3685220729366603,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0001087378640776699,
      "loss": 1.489,
      "step": 288
    },
    {
      "epoch": 0.36980166346769033,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00010825242718446602,
      "loss": 1.518,
      "step": 289
    },
    {
      "epoch": 0.3710812539987204,
      "grad_norm": 0.25,
      "learning_rate": 0.00010776699029126213,
      "loss": 1.5142,
      "step": 290
    },
    {
      "epoch": 0.3723608445297505,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00010728155339805826,
      "loss": 1.5229,
      "step": 291
    },
    {
      "epoch": 0.37364043506078054,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00010679611650485437,
      "loss": 1.4803,
      "step": 292
    },
    {
      "epoch": 0.37492002559181065,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00010631067961165049,
      "loss": 1.5791,
      "step": 293
    },
    {
      "epoch": 0.3761996161228407,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.0001058252427184466,
      "loss": 1.5336,
      "step": 294
    },
    {
      "epoch": 0.37747920665387075,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00010533980582524272,
      "loss": 1.4581,
      "step": 295
    },
    {
      "epoch": 0.37875879718490085,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00010485436893203883,
      "loss": 1.4919,
      "step": 296
    },
    {
      "epoch": 0.3800383877159309,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00010436893203883496,
      "loss": 1.5778,
      "step": 297
    },
    {
      "epoch": 0.38131797824696095,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00010388349514563107,
      "loss": 1.4489,
      "step": 298
    },
    {
      "epoch": 0.38259756877799106,
      "grad_norm": 0.265625,
      "learning_rate": 0.00010339805825242719,
      "loss": 1.4704,
      "step": 299
    },
    {
      "epoch": 0.3838771593090211,
      "grad_norm": 0.267578125,
      "learning_rate": 0.0001029126213592233,
      "loss": 1.5316,
      "step": 300
    },
    {
      "epoch": 0.3838771593090211,
      "eval_loss": 1.4935728311538696,
      "eval_runtime": 103.76,
      "eval_samples_per_second": 48.188,
      "eval_steps_per_second": 1.513,
      "step": 300
    },
    {
      "epoch": 0.38515674984005116,
      "grad_norm": 0.265625,
      "learning_rate": 0.00010242718446601942,
      "loss": 1.5093,
      "step": 301
    },
    {
      "epoch": 0.38643634037108127,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00010194174757281553,
      "loss": 1.4871,
      "step": 302
    },
    {
      "epoch": 0.3877159309021113,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00010145631067961166,
      "loss": 1.4842,
      "step": 303
    },
    {
      "epoch": 0.3889955214331414,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00010097087378640778,
      "loss": 1.5545,
      "step": 304
    },
    {
      "epoch": 0.3902751119641715,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00010048543689320389,
      "loss": 1.5671,
      "step": 305
    },
    {
      "epoch": 0.3915547024952015,
      "grad_norm": 0.263671875,
      "learning_rate": 0.0001,
      "loss": 1.4817,
      "step": 306
    },
    {
      "epoch": 0.39283429302623163,
      "grad_norm": 0.275390625,
      "learning_rate": 9.951456310679612e-05,
      "loss": 1.4727,
      "step": 307
    },
    {
      "epoch": 0.3941138835572617,
      "grad_norm": 0.251953125,
      "learning_rate": 9.902912621359223e-05,
      "loss": 1.4717,
      "step": 308
    },
    {
      "epoch": 0.39539347408829173,
      "grad_norm": 0.2734375,
      "learning_rate": 9.854368932038835e-05,
      "loss": 1.4795,
      "step": 309
    },
    {
      "epoch": 0.39667306461932184,
      "grad_norm": 0.267578125,
      "learning_rate": 9.805825242718448e-05,
      "loss": 1.4475,
      "step": 310
    },
    {
      "epoch": 0.3979526551503519,
      "grad_norm": 0.283203125,
      "learning_rate": 9.757281553398059e-05,
      "loss": 1.4661,
      "step": 311
    },
    {
      "epoch": 0.39923224568138194,
      "grad_norm": 0.271484375,
      "learning_rate": 9.70873786407767e-05,
      "loss": 1.4808,
      "step": 312
    },
    {
      "epoch": 0.40051183621241204,
      "grad_norm": 0.29296875,
      "learning_rate": 9.660194174757282e-05,
      "loss": 1.5169,
      "step": 313
    },
    {
      "epoch": 0.4017914267434421,
      "grad_norm": 0.296875,
      "learning_rate": 9.611650485436893e-05,
      "loss": 1.4856,
      "step": 314
    },
    {
      "epoch": 0.40307101727447214,
      "grad_norm": 0.2578125,
      "learning_rate": 9.563106796116505e-05,
      "loss": 1.4995,
      "step": 315
    },
    {
      "epoch": 0.40435060780550225,
      "grad_norm": 0.28125,
      "learning_rate": 9.514563106796118e-05,
      "loss": 1.5682,
      "step": 316
    },
    {
      "epoch": 0.4056301983365323,
      "grad_norm": 0.265625,
      "learning_rate": 9.466019417475729e-05,
      "loss": 1.492,
      "step": 317
    },
    {
      "epoch": 0.4069097888675624,
      "grad_norm": 0.275390625,
      "learning_rate": 9.417475728155341e-05,
      "loss": 1.4706,
      "step": 318
    },
    {
      "epoch": 0.40818937939859246,
      "grad_norm": 0.265625,
      "learning_rate": 9.368932038834952e-05,
      "loss": 1.5184,
      "step": 319
    },
    {
      "epoch": 0.4094689699296225,
      "grad_norm": 0.283203125,
      "learning_rate": 9.320388349514564e-05,
      "loss": 1.5288,
      "step": 320
    },
    {
      "epoch": 0.4094689699296225,
      "eval_loss": 1.4924702644348145,
      "eval_runtime": 103.7588,
      "eval_samples_per_second": 48.189,
      "eval_steps_per_second": 1.513,
      "step": 320
    },
    {
      "epoch": 0.4107485604606526,
      "grad_norm": 0.275390625,
      "learning_rate": 9.271844660194175e-05,
      "loss": 1.5166,
      "step": 321
    },
    {
      "epoch": 0.41202815099168266,
      "grad_norm": 0.265625,
      "learning_rate": 9.223300970873788e-05,
      "loss": 1.5137,
      "step": 322
    },
    {
      "epoch": 0.4133077415227127,
      "grad_norm": 0.2890625,
      "learning_rate": 9.174757281553399e-05,
      "loss": 1.4874,
      "step": 323
    },
    {
      "epoch": 0.4145873320537428,
      "grad_norm": 0.27734375,
      "learning_rate": 9.126213592233011e-05,
      "loss": 1.4144,
      "step": 324
    },
    {
      "epoch": 0.41586692258477287,
      "grad_norm": 0.251953125,
      "learning_rate": 9.077669902912622e-05,
      "loss": 1.529,
      "step": 325
    },
    {
      "epoch": 0.4171465131158029,
      "grad_norm": 0.2578125,
      "learning_rate": 9.029126213592234e-05,
      "loss": 1.5507,
      "step": 326
    },
    {
      "epoch": 0.418426103646833,
      "grad_norm": 0.283203125,
      "learning_rate": 8.980582524271845e-05,
      "loss": 1.5447,
      "step": 327
    },
    {
      "epoch": 0.4197056941778631,
      "grad_norm": 0.2578125,
      "learning_rate": 8.932038834951457e-05,
      "loss": 1.556,
      "step": 328
    },
    {
      "epoch": 0.4209852847088932,
      "grad_norm": 0.2734375,
      "learning_rate": 8.88349514563107e-05,
      "loss": 1.4945,
      "step": 329
    },
    {
      "epoch": 0.42226487523992323,
      "grad_norm": 0.2578125,
      "learning_rate": 8.834951456310681e-05,
      "loss": 1.5668,
      "step": 330
    },
    {
      "epoch": 0.4235444657709533,
      "grad_norm": 0.259765625,
      "learning_rate": 8.786407766990292e-05,
      "loss": 1.501,
      "step": 331
    },
    {
      "epoch": 0.4248240563019834,
      "grad_norm": 0.259765625,
      "learning_rate": 8.737864077669902e-05,
      "loss": 1.5187,
      "step": 332
    },
    {
      "epoch": 0.42610364683301344,
      "grad_norm": 0.259765625,
      "learning_rate": 8.689320388349514e-05,
      "loss": 1.5736,
      "step": 333
    },
    {
      "epoch": 0.4273832373640435,
      "grad_norm": 0.28125,
      "learning_rate": 8.640776699029127e-05,
      "loss": 1.5085,
      "step": 334
    },
    {
      "epoch": 0.4286628278950736,
      "grad_norm": 0.25390625,
      "learning_rate": 8.592233009708738e-05,
      "loss": 1.4757,
      "step": 335
    },
    {
      "epoch": 0.42994241842610365,
      "grad_norm": 0.267578125,
      "learning_rate": 8.54368932038835e-05,
      "loss": 1.5243,
      "step": 336
    },
    {
      "epoch": 0.4312220089571337,
      "grad_norm": 0.26953125,
      "learning_rate": 8.495145631067961e-05,
      "loss": 1.5662,
      "step": 337
    },
    {
      "epoch": 0.4325015994881638,
      "grad_norm": 0.265625,
      "learning_rate": 8.446601941747573e-05,
      "loss": 1.433,
      "step": 338
    },
    {
      "epoch": 0.43378119001919385,
      "grad_norm": 0.283203125,
      "learning_rate": 8.398058252427184e-05,
      "loss": 1.5378,
      "step": 339
    },
    {
      "epoch": 0.4350607805502239,
      "grad_norm": 0.29296875,
      "learning_rate": 8.349514563106797e-05,
      "loss": 1.5276,
      "step": 340
    },
    {
      "epoch": 0.4350607805502239,
      "eval_loss": 1.4914450645446777,
      "eval_runtime": 103.7522,
      "eval_samples_per_second": 48.192,
      "eval_steps_per_second": 1.513,
      "step": 340
    },
    {
      "epoch": 0.436340371081254,
      "grad_norm": 0.2734375,
      "learning_rate": 8.300970873786408e-05,
      "loss": 1.4723,
      "step": 341
    },
    {
      "epoch": 0.43761996161228406,
      "grad_norm": 0.25390625,
      "learning_rate": 8.25242718446602e-05,
      "loss": 1.5185,
      "step": 342
    },
    {
      "epoch": 0.43889955214331416,
      "grad_norm": 0.271484375,
      "learning_rate": 8.203883495145631e-05,
      "loss": 1.5317,
      "step": 343
    },
    {
      "epoch": 0.4401791426743442,
      "grad_norm": 0.251953125,
      "learning_rate": 8.155339805825243e-05,
      "loss": 1.5254,
      "step": 344
    },
    {
      "epoch": 0.44145873320537427,
      "grad_norm": 0.259765625,
      "learning_rate": 8.106796116504854e-05,
      "loss": 1.5152,
      "step": 345
    },
    {
      "epoch": 0.44273832373640437,
      "grad_norm": 0.28125,
      "learning_rate": 8.058252427184466e-05,
      "loss": 1.4812,
      "step": 346
    },
    {
      "epoch": 0.4440179142674344,
      "grad_norm": 0.26953125,
      "learning_rate": 8.009708737864078e-05,
      "loss": 1.5023,
      "step": 347
    },
    {
      "epoch": 0.44529750479846447,
      "grad_norm": 0.291015625,
      "learning_rate": 7.96116504854369e-05,
      "loss": 1.4516,
      "step": 348
    },
    {
      "epoch": 0.4465770953294946,
      "grad_norm": 0.279296875,
      "learning_rate": 7.912621359223301e-05,
      "loss": 1.4349,
      "step": 349
    },
    {
      "epoch": 0.44785668586052463,
      "grad_norm": 0.28125,
      "learning_rate": 7.864077669902913e-05,
      "loss": 1.5181,
      "step": 350
    },
    {
      "epoch": 0.4491362763915547,
      "grad_norm": 0.27734375,
      "learning_rate": 7.815533980582524e-05,
      "loss": 1.4765,
      "step": 351
    },
    {
      "epoch": 0.4504158669225848,
      "grad_norm": 0.275390625,
      "learning_rate": 7.766990291262136e-05,
      "loss": 1.524,
      "step": 352
    },
    {
      "epoch": 0.45169545745361483,
      "grad_norm": 0.25,
      "learning_rate": 7.718446601941748e-05,
      "loss": 1.5236,
      "step": 353
    },
    {
      "epoch": 0.45297504798464494,
      "grad_norm": 0.2890625,
      "learning_rate": 7.66990291262136e-05,
      "loss": 1.5358,
      "step": 354
    },
    {
      "epoch": 0.454254638515675,
      "grad_norm": 0.244140625,
      "learning_rate": 7.621359223300971e-05,
      "loss": 1.5447,
      "step": 355
    },
    {
      "epoch": 0.45553422904670504,
      "grad_norm": 0.287109375,
      "learning_rate": 7.572815533980583e-05,
      "loss": 1.428,
      "step": 356
    },
    {
      "epoch": 0.45681381957773515,
      "grad_norm": 0.263671875,
      "learning_rate": 7.524271844660194e-05,
      "loss": 1.3921,
      "step": 357
    },
    {
      "epoch": 0.4580934101087652,
      "grad_norm": 0.2734375,
      "learning_rate": 7.475728155339806e-05,
      "loss": 1.5059,
      "step": 358
    },
    {
      "epoch": 0.45937300063979525,
      "grad_norm": 0.2734375,
      "learning_rate": 7.427184466019417e-05,
      "loss": 1.4832,
      "step": 359
    },
    {
      "epoch": 0.46065259117082535,
      "grad_norm": 0.279296875,
      "learning_rate": 7.37864077669903e-05,
      "loss": 1.5236,
      "step": 360
    },
    {
      "epoch": 0.46065259117082535,
      "eval_loss": 1.4904447793960571,
      "eval_runtime": 103.753,
      "eval_samples_per_second": 48.191,
      "eval_steps_per_second": 1.513,
      "step": 360
    },
    {
      "epoch": 0.4619321817018554,
      "grad_norm": 0.287109375,
      "learning_rate": 7.330097087378641e-05,
      "loss": 1.5008,
      "step": 361
    },
    {
      "epoch": 0.46321177223288545,
      "grad_norm": 0.28125,
      "learning_rate": 7.281553398058253e-05,
      "loss": 1.5244,
      "step": 362
    },
    {
      "epoch": 0.46449136276391556,
      "grad_norm": 0.27734375,
      "learning_rate": 7.233009708737864e-05,
      "loss": 1.5849,
      "step": 363
    },
    {
      "epoch": 0.4657709532949456,
      "grad_norm": 0.2490234375,
      "learning_rate": 7.184466019417476e-05,
      "loss": 1.4882,
      "step": 364
    },
    {
      "epoch": 0.46705054382597566,
      "grad_norm": 0.267578125,
      "learning_rate": 7.135922330097087e-05,
      "loss": 1.4905,
      "step": 365
    },
    {
      "epoch": 0.46833013435700577,
      "grad_norm": 0.267578125,
      "learning_rate": 7.0873786407767e-05,
      "loss": 1.4391,
      "step": 366
    },
    {
      "epoch": 0.4696097248880358,
      "grad_norm": 0.2451171875,
      "learning_rate": 7.038834951456312e-05,
      "loss": 1.5034,
      "step": 367
    },
    {
      "epoch": 0.4708893154190659,
      "grad_norm": 0.283203125,
      "learning_rate": 6.990291262135923e-05,
      "loss": 1.4928,
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.472168905950096, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 6.941747572815534e-05, |
|
"loss": 1.5578, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.473448496481126, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 6.893203883495146e-05, |
|
"loss": 1.5403, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.47472808701215613, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 6.844660194174757e-05, |
|
"loss": 1.5081, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.4760076775431862, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 6.79611650485437e-05, |
|
"loss": 1.5799, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.47728726807421623, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 6.747572815533982e-05, |
|
"loss": 1.5097, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.47856685860524634, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 6.699029126213593e-05, |
|
"loss": 1.5164, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.4798464491362764, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 6.650485436893205e-05, |
|
"loss": 1.5139, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.48112603966730644, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 6.601941747572816e-05, |
|
"loss": 1.4756, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.48240563019833654, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 6.553398058252428e-05, |
|
"loss": 1.5325, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.4836852207293666, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 6.504854368932039e-05, |
|
"loss": 1.4932, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.4849648112603967, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 6.456310679611652e-05, |
|
"loss": 1.5157, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.48624440179142675, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 6.407766990291263e-05, |
|
"loss": 1.4882, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.48624440179142675, |
|
"eval_loss": 1.4896763563156128, |
|
"eval_runtime": 103.7894, |
|
"eval_samples_per_second": 48.174, |
|
"eval_steps_per_second": 1.513, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4875239923224568, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 6.359223300970875e-05, |
|
"loss": 1.546, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.4888035828534869, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 6.310679611650486e-05, |
|
"loss": 1.4907, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.49008317338451696, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 6.262135922330098e-05, |
|
"loss": 1.4512, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.491362763915547, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 6.213592233009709e-05, |
|
"loss": 1.5442, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.4926423544465771, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 6.16504854368932e-05, |
|
"loss": 1.4975, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.49392194497760716, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 6.116504854368932e-05, |
|
"loss": 1.534, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.4952015355086372, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 6.0679611650485434e-05, |
|
"loss": 1.417, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.4964811260396673, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 6.019417475728155e-05, |
|
"loss": 1.4766, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.49776071657069737, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 5.970873786407767e-05, |
|
"loss": 1.4672, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.4990403071017274, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 5.9223300970873785e-05, |
|
"loss": 1.5021, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5003198976327575, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 5.87378640776699e-05, |
|
"loss": 1.5723, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.5015994881637876, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 5.825242718446602e-05, |
|
"loss": 1.5351, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.5028790786948176, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 5.7766990291262135e-05, |
|
"loss": 1.5922, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.5041586692258477, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 5.728155339805825e-05, |
|
"loss": 1.5, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.5054382597568778, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 5.679611650485437e-05, |
|
"loss": 1.5057, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.5067178502879078, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 5.6310679611650486e-05, |
|
"loss": 1.4968, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.5079974408189379, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 5.58252427184466e-05, |
|
"loss": 1.4383, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.509277031349968, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 5.533980582524272e-05, |
|
"loss": 1.5069, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.510556621880998, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 5.4854368932038836e-05, |
|
"loss": 1.4293, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.5118362124120281, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 5.436893203883495e-05, |
|
"loss": 1.5098, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5118362124120281, |
|
"eval_loss": 1.4892088174819946, |
|
"eval_runtime": 103.7401, |
|
"eval_samples_per_second": 48.197, |
|
"eval_steps_per_second": 1.513, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5131158029430583, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 5.3883495145631065e-05, |
|
"loss": 1.4924, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.5143953934740882, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 5.339805825242719e-05, |
|
"loss": 1.5093, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.5156749840051184, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 5.29126213592233e-05, |
|
"loss": 1.5384, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.5169545745361485, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 5.2427184466019416e-05, |
|
"loss": 1.5331, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.5182341650671785, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 5.194174757281554e-05, |
|
"loss": 1.5009, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5195137555982086, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 5.145631067961165e-05, |
|
"loss": 1.4744, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.5207933461292387, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 5.0970873786407766e-05, |
|
"loss": 1.5178, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.5220729366602687, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 5.048543689320389e-05, |
|
"loss": 1.4893, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.5233525271912988, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 5e-05, |
|
"loss": 1.5309, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.5246321177223289, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 4.951456310679612e-05, |
|
"loss": 1.4873, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.525911708253359, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.902912621359224e-05, |
|
"loss": 1.5083, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.527191298784389, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 4.854368932038835e-05, |
|
"loss": 1.5487, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.5284708893154191, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 4.805825242718447e-05, |
|
"loss": 1.4886, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.5297504798464492, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.757281553398059e-05, |
|
"loss": 1.4687, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.5310300703774792, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 4.7087378640776703e-05, |
|
"loss": 1.5612, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.5323096609085093, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 4.660194174757282e-05, |
|
"loss": 1.534, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.5335892514395394, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.611650485436894e-05, |
|
"loss": 1.4985, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.5348688419705694, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 4.5631067961165054e-05, |
|
"loss": 1.5042, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.5361484325015995, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 4.514563106796117e-05, |
|
"loss": 1.5292, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.5374280230326296, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.466019417475728e-05, |
|
"loss": 1.4526, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5374280230326296, |
|
"eval_loss": 1.4887601137161255, |
|
"eval_runtime": 103.7847, |
|
"eval_samples_per_second": 48.177, |
|
"eval_steps_per_second": 1.513, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5387076135636596, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 4.4174757281553404e-05, |
|
"loss": 1.4529, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.5399872040946897, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 4.368932038834951e-05, |
|
"loss": 1.4163, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.5412667946257198, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 4.3203883495145634e-05, |
|
"loss": 1.5169, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.5425463851567498, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 4.271844660194175e-05, |
|
"loss": 1.4888, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.5438259756877799, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 4.223300970873786e-05, |
|
"loss": 1.4733, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.54510556621881, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 4.1747572815533984e-05, |
|
"loss": 1.503, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.54638515674984, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 4.12621359223301e-05, |
|
"loss": 1.4406, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.5476647472808701, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 4.077669902912621e-05, |
|
"loss": 1.4952, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.5489443378119002, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 4.029126213592233e-05, |
|
"loss": 1.4837, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.5502239283429302, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 3.980582524271845e-05, |
|
"loss": 1.6037, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5515035188739603, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 3.9320388349514564e-05, |
|
"loss": 1.4425, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.5527831094049904, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 3.883495145631068e-05, |
|
"loss": 1.4502, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.5540626999360204, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 3.83495145631068e-05, |
|
"loss": 1.4936, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.5553422904670505, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.7864077669902914e-05, |
|
"loss": 1.5186, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.5566218809980806, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 3.737864077669903e-05, |
|
"loss": 1.464, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5579014715291107, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 3.689320388349515e-05, |
|
"loss": 1.4973, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.5591810620601407, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 3.6407766990291265e-05, |
|
"loss": 1.5231, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.5604606525911708, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.592233009708738e-05, |
|
"loss": 1.5225, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.5617402431222009, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 3.54368932038835e-05, |
|
"loss": 1.4312, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.5630198336532309, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 3.4951456310679615e-05, |
|
"loss": 1.5529, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5630198336532309, |
|
"eval_loss": 1.4884061813354492, |
|
"eval_runtime": 103.7793, |
|
"eval_samples_per_second": 48.179, |
|
"eval_steps_per_second": 1.513, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.564299424184261, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 3.446601941747573e-05, |
|
"loss": 1.4933, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.5655790147152912, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 3.398058252427185e-05, |
|
"loss": 1.4352, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.5668586052463211, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 3.3495145631067966e-05, |
|
"loss": 1.5062, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.5681381957773513, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 3.300970873786408e-05, |
|
"loss": 1.5018, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.5694177863083814, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 3.2524271844660195e-05, |
|
"loss": 1.5037, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5706973768394114, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.2038834951456316e-05, |
|
"loss": 1.5056, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.5719769673704415, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 3.155339805825243e-05, |
|
"loss": 1.567, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.5732565579014716, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.1067961165048545e-05, |
|
"loss": 1.4753, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.5745361484325016, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 3.058252427184466e-05, |
|
"loss": 1.5731, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.5758157389635317, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 3.0097087378640774e-05, |
|
"loss": 1.4695, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5770953294945618, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 2.9611650485436892e-05, |
|
"loss": 1.4394, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.5783749200255918, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 2.912621359223301e-05, |
|
"loss": 1.52, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.5796545105566219, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 2.8640776699029125e-05, |
|
"loss": 1.5178, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.580934101087652, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 2.8155339805825243e-05, |
|
"loss": 1.5035, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.582213691618682, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 2.766990291262136e-05, |
|
"loss": 1.5201, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.5834932821497121, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 2.7184466019417475e-05, |
|
"loss": 1.5495, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.5847728726807422, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 2.6699029126213593e-05, |
|
"loss": 1.483, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.5860524632117722, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 2.6213592233009708e-05, |
|
"loss": 1.5084, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.5873320537428023, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 2.5728155339805826e-05, |
|
"loss": 1.5115, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.5886116442738324, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 2.5242718446601944e-05, |
|
"loss": 1.4747, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5886116442738324, |
|
"eval_loss": 1.4881880283355713, |
|
"eval_runtime": 103.7794, |
|
"eval_samples_per_second": 48.179, |
|
"eval_steps_per_second": 1.513, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5898912348048625, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 2.475728155339806e-05, |
|
"loss": 1.5567, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.5911708253358925, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 2.4271844660194176e-05, |
|
"loss": 1.4473, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.5924504158669226, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 2.3786407766990294e-05, |
|
"loss": 1.5204, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.5937300063979527, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 2.330097087378641e-05, |
|
"loss": 1.537, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.5950095969289827, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 2.2815533980582527e-05, |
|
"loss": 1.4269, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.5962891874600128, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 2.233009708737864e-05, |
|
"loss": 1.452, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.5975687779910429, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 2.1844660194174756e-05, |
|
"loss": 1.4702, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.5988483685220729, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 2.1359223300970874e-05, |
|
"loss": 1.4577, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.600127959053103, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 2.0873786407766992e-05, |
|
"loss": 1.5009, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.6014075495841331, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 2.0388349514563107e-05, |
|
"loss": 1.4926, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6026871401151631, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 1.9902912621359225e-05, |
|
"loss": 1.5575, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.6039667306461932, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 1.941747572815534e-05, |
|
"loss": 1.4811, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.6052463211772233, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 1.8932038834951457e-05, |
|
"loss": 1.496, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.6065259117082533, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 1.8446601941747575e-05, |
|
"loss": 1.4922, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.6078055022392834, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.796116504854369e-05, |
|
"loss": 1.5435, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.6090850927703135, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 1.7475728155339808e-05, |
|
"loss": 1.5427, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.6103646833013435, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 1.6990291262135926e-05, |
|
"loss": 1.4921, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.6116442738323736, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 1.650485436893204e-05, |
|
"loss": 1.4226, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.6129238643634037, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 1.6019417475728158e-05, |
|
"loss": 1.4878, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.6142034548944337, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 1.5533980582524273e-05, |
|
"loss": 1.5165, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6142034548944337, |
|
"eval_loss": 1.4879465103149414, |
|
"eval_runtime": 103.7919, |
|
"eval_samples_per_second": 48.173, |
|
"eval_steps_per_second": 1.513, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6154830454254638, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.5048543689320387e-05, |
|
"loss": 1.5134, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.6167626359564939, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 1.4563106796116505e-05, |
|
"loss": 1.4587, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.6180422264875239, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.4077669902912621e-05, |
|
"loss": 1.4203, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.619321817018554, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.3592233009708738e-05, |
|
"loss": 1.4245, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.6206014075495841, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 1.3106796116504854e-05, |
|
"loss": 1.513, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6218809980806143, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 1.2621359223300972e-05, |
|
"loss": 1.5439, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.6231605886116443, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 1.2135922330097088e-05, |
|
"loss": 1.5063, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.6244401791426744, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 1.1650485436893204e-05, |
|
"loss": 1.4927, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.6257197696737045, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.116504854368932e-05, |
|
"loss": 1.4943, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.6269993602047345, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 1.0679611650485437e-05, |
|
"loss": 1.5125, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6282789507357646, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 1.0194174757281553e-05, |
|
"loss": 1.4396, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.6295585412667947, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 9.70873786407767e-06, |
|
"loss": 1.4928, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.6308381317978247, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 9.223300970873788e-06, |
|
"loss": 1.4664, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.6321177223288548, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 8.737864077669904e-06, |
|
"loss": 1.4305, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.6333973128598849, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.25242718446602e-06, |
|
"loss": 1.5016, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.6346769033909149, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 7.766990291262136e-06, |
|
"loss": 1.4641, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.635956493921945, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 7.281553398058253e-06, |
|
"loss": 1.497, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.6372360844529751, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 6.796116504854369e-06, |
|
"loss": 1.54, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.6385156749840051, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 6.310679611650486e-06, |
|
"loss": 1.4769, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.6397952655150352, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 5.825242718446602e-06, |
|
"loss": 1.5592, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6397952655150352, |
|
"eval_loss": 1.4878435134887695, |
|
"eval_runtime": 103.7542, |
|
"eval_samples_per_second": 48.191, |
|
"eval_steps_per_second": 1.513, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6410748560460653, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 5.3398058252427185e-06, |
|
"loss": 1.4929, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.6423544465770953, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 4.854368932038835e-06, |
|
"loss": 1.4825, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.6436340371081254, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 4.368932038834952e-06, |
|
"loss": 1.5464, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.6449136276391555, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 3.883495145631068e-06, |
|
"loss": 1.4797, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.6461932181701855, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 3.3980582524271844e-06, |
|
"loss": 1.5039, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.6474728087012156, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 2.912621359223301e-06, |
|
"loss": 1.4895, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.6487523992322457, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 2.4271844660194174e-06, |
|
"loss": 1.4887, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.6500319897632757, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 1.941747572815534e-06, |
|
"loss": 1.4734, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.6513115802943058, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 1.4563106796116506e-06, |
|
"loss": 1.4636, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.6525911708253359, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.70873786407767e-07, |
|
"loss": 1.5035, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.653870761356366, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 4.854368932038835e-07, |
|
"loss": 1.5264, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.655150351887396, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0, |
|
"loss": 1.5441, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.655150351887396, |
|
"step": 512, |
|
"total_flos": 3.6685588640169984e+17, |
|
"train_loss": 1.622293347492814, |
|
"train_runtime": 4726.6516, |
|
"train_samples_per_second": 6.933, |
|
"train_steps_per_second": 0.108 |
|
}, |
|
{ |
|
"epoch": 0.655150351887396, |
|
"eval_loss": 1.4878435134887695, |
|
"eval_runtime": 103.7194, |
|
"eval_samples_per_second": 48.207, |
|
"eval_steps_per_second": 1.514, |
|
"step": 512 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 512, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"total_flos": 3.6685588640169984e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|