{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.612048192771084,
  "eval_steps": 500,
  "global_step": 343,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0963855421686747,
      "grad_norm": 1.7634485960006714,
      "learning_rate": 4.998814299283415e-05,
      "loss": 0.8996,
      "num_input_tokens_seen": 78528,
      "step": 5
    },
    {
      "epoch": 0.1927710843373494,
      "grad_norm": 1.3068124055862427,
      "learning_rate": 4.995258321842611e-05,
      "loss": 0.6806,
      "num_input_tokens_seen": 159120,
      "step": 10
    },
    {
      "epoch": 0.2891566265060241,
      "grad_norm": 1.2104840278625488,
      "learning_rate": 4.989335440737586e-05,
      "loss": 0.618,
      "num_input_tokens_seen": 223552,
      "step": 15
    },
    {
      "epoch": 0.3855421686746988,
      "grad_norm": 1.4112542867660522,
      "learning_rate": 4.98105127417984e-05,
      "loss": 0.5594,
      "num_input_tokens_seen": 290944,
      "step": 20
    },
    {
      "epoch": 0.4819277108433735,
      "grad_norm": 0.9026587605476379,
      "learning_rate": 4.9704136802031485e-05,
      "loss": 0.5253,
      "num_input_tokens_seen": 364064,
      "step": 25
    },
    {
      "epoch": 0.5783132530120482,
      "grad_norm": 0.9427546858787537,
      "learning_rate": 4.957432749209755e-05,
      "loss": 0.4794,
      "num_input_tokens_seen": 440176,
      "step": 30
    },
    {
      "epoch": 0.6746987951807228,
      "grad_norm": 1.0594468116760254,
      "learning_rate": 4.942120794399002e-05,
      "loss": 0.4546,
      "num_input_tokens_seen": 517184,
      "step": 35
    },
    {
      "epoch": 0.7710843373493976,
      "grad_norm": 0.9458279013633728,
      "learning_rate": 4.9244923400875245e-05,
      "loss": 0.4703,
      "num_input_tokens_seen": 591424,
      "step": 40
    },
    {
      "epoch": 0.8674698795180723,
      "grad_norm": 1.1610336303710938,
      "learning_rate": 4.9045641079320484e-05,
      "loss": 0.4407,
      "num_input_tokens_seen": 662784,
      "step": 45
    },
    {
      "epoch": 0.963855421686747,
      "grad_norm": 1.0153354406356812,
      "learning_rate": 4.882355001067892e-05,
      "loss": 0.4425,
      "num_input_tokens_seen": 734784,
      "step": 50
    },
    {
      "epoch": 1.0602409638554218,
      "grad_norm": 1.0889695882797241,
      "learning_rate": 4.857886086178194e-05,
      "loss": 0.4081,
      "num_input_tokens_seen": 808336,
      "step": 55
    },
    {
      "epoch": 1.1566265060240963,
      "grad_norm": 0.9168598055839539,
      "learning_rate": 4.8311805735108894e-05,
      "loss": 0.4002,
      "num_input_tokens_seen": 882672,
      "step": 60
    },
    {
      "epoch": 1.2530120481927711,
      "grad_norm": 0.8168660998344421,
      "learning_rate": 4.802263794862385e-05,
      "loss": 0.3587,
      "num_input_tokens_seen": 947680,
      "step": 65
    },
    {
      "epoch": 1.3493975903614457,
      "grad_norm": 1.0652003288269043,
      "learning_rate": 4.7711631795488096e-05,
      "loss": 0.356,
      "num_input_tokens_seen": 1022112,
      "step": 70
    },
    {
      "epoch": 1.4457831325301205,
      "grad_norm": 1.1781517267227173,
      "learning_rate": 4.7379082283876566e-05,
      "loss": 0.3639,
      "num_input_tokens_seen": 1091744,
      "step": 75
    },
    {
      "epoch": 1.5421686746987953,
      "grad_norm": 1.0550976991653442,
      "learning_rate": 4.702530485714461e-05,
      "loss": 0.3288,
      "num_input_tokens_seen": 1163728,
      "step": 80
    },
    {
      "epoch": 1.6385542168674698,
      "grad_norm": 1.3946661949157715,
      "learning_rate": 4.665063509461097e-05,
      "loss": 0.3563,
      "num_input_tokens_seen": 1245728,
      "step": 85
    },
    {
      "epoch": 1.7349397590361446,
      "grad_norm": 1.1458536386489868,
      "learning_rate": 4.625542839324036e-05,
      "loss": 0.3642,
      "num_input_tokens_seen": 1315056,
      "step": 90
    },
    {
      "epoch": 1.8313253012048194,
      "grad_norm": 1.0227209329605103,
      "learning_rate": 4.584005963052799e-05,
      "loss": 0.3407,
      "num_input_tokens_seen": 1392224,
      "step": 95
    },
    {
      "epoch": 1.927710843373494,
      "grad_norm": 1.0699985027313232,
      "learning_rate": 4.540492280890555e-05,
      "loss": 0.3216,
      "num_input_tokens_seen": 1471008,
      "step": 100
    },
    {
      "epoch": 2.0240963855421685,
      "grad_norm": 0.8573477268218994,
      "learning_rate": 4.4950430682006e-05,
      "loss": 0.3197,
      "num_input_tokens_seen": 1546912,
      "step": 105
    },
    {
      "epoch": 2.1204819277108435,
      "grad_norm": 1.1516242027282715,
      "learning_rate": 4.447701436314176e-05,
      "loss": 0.2904,
      "num_input_tokens_seen": 1611328,
      "step": 110
    },
    {
      "epoch": 2.216867469879518,
      "grad_norm": 1.0890793800354004,
      "learning_rate": 4.398512291636768e-05,
      "loss": 0.2498,
      "num_input_tokens_seen": 1682528,
      "step": 115
    },
    {
      "epoch": 2.3132530120481927,
      "grad_norm": 1.3621636629104614,
      "learning_rate": 4.347522293051648e-05,
      "loss": 0.269,
      "num_input_tokens_seen": 1751856,
      "step": 120
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 1.338083028793335,
      "learning_rate": 4.294779807661105e-05,
      "loss": 0.2838,
      "num_input_tokens_seen": 1830288,
      "step": 125
    },
    {
      "epoch": 2.5060240963855422,
      "grad_norm": 1.2083592414855957,
      "learning_rate": 4.2403348649073174e-05,
      "loss": 0.2466,
      "num_input_tokens_seen": 1905296,
      "step": 130
    },
    {
      "epoch": 2.602409638554217,
      "grad_norm": 1.35024094581604,
      "learning_rate": 4.184239109116393e-05,
      "loss": 0.2272,
      "num_input_tokens_seen": 1974464,
      "step": 135
    },
    {
      "epoch": 2.6987951807228914,
      "grad_norm": 1.3738912343978882,
      "learning_rate": 4.126545750510605e-05,
      "loss": 0.2484,
      "num_input_tokens_seen": 2058176,
      "step": 140
    },
    {
      "epoch": 2.7951807228915664,
      "grad_norm": 1.5877448320388794,
      "learning_rate": 4.067309514735267e-05,
      "loss": 0.2339,
      "num_input_tokens_seen": 2124912,
      "step": 145
    },
    {
      "epoch": 2.891566265060241,
      "grad_norm": 1.3735121488571167,
      "learning_rate": 4.0065865909481417e-05,
      "loss": 0.2597,
      "num_input_tokens_seen": 2213456,
      "step": 150
    },
    {
      "epoch": 2.9879518072289155,
      "grad_norm": 1.6480368375778198,
      "learning_rate": 3.9444345785206285e-05,
      "loss": 0.2525,
      "num_input_tokens_seen": 2281680,
      "step": 155
    },
    {
      "epoch": 3.0843373493975905,
      "grad_norm": 1.2931358814239502,
      "learning_rate": 3.880912432401265e-05,
      "loss": 0.1832,
      "num_input_tokens_seen": 2349408,
      "step": 160
    },
    {
      "epoch": 3.180722891566265,
      "grad_norm": 1.4131468534469604,
      "learning_rate": 3.81608040719339e-05,
      "loss": 0.1519,
      "num_input_tokens_seen": 2425456,
      "step": 165
    },
    {
      "epoch": 3.2771084337349397,
      "grad_norm": 1.6228159666061401,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.1707,
      "num_input_tokens_seen": 2494064,
      "step": 170
    },
    {
      "epoch": 3.3734939759036147,
      "grad_norm": 1.1356842517852783,
      "learning_rate": 3.6827338920900254e-05,
      "loss": 0.1603,
      "num_input_tokens_seen": 2573616,
      "step": 175
    },
    {
      "epoch": 3.4698795180722892,
      "grad_norm": 1.3535553216934204,
      "learning_rate": 3.6143458894413465e-05,
      "loss": 0.1683,
      "num_input_tokens_seen": 2657744,
      "step": 180
    },
    {
      "epoch": 3.566265060240964,
      "grad_norm": 1.3832409381866455,
      "learning_rate": 3.544900862216959e-05,
      "loss": 0.1734,
      "num_input_tokens_seen": 2721200,
      "step": 185
    },
    {
      "epoch": 3.662650602409639,
      "grad_norm": 1.6430705785751343,
      "learning_rate": 3.474464683231698e-05,
      "loss": 0.1543,
      "num_input_tokens_seen": 2798320,
      "step": 190
    },
    {
      "epoch": 3.7590361445783134,
      "grad_norm": 1.7706836462020874,
      "learning_rate": 3.403104165467883e-05,
      "loss": 0.1601,
      "num_input_tokens_seen": 2879200,
      "step": 195
    },
    {
      "epoch": 3.855421686746988,
      "grad_norm": 1.7721610069274902,
      "learning_rate": 3.330886998699149e-05,
      "loss": 0.1911,
      "num_input_tokens_seen": 2947024,
      "step": 200
    },
    {
      "epoch": 3.9518072289156625,
      "grad_norm": 1.666278600692749,
      "learning_rate": 3.257881685282609e-05,
      "loss": 0.1741,
      "num_input_tokens_seen": 3016656,
      "step": 205
    },
    {
      "epoch": 4.048192771084337,
      "grad_norm": 1.099639892578125,
      "learning_rate": 3.1841574751802076e-05,
      "loss": 0.1334,
      "num_input_tokens_seen": 3084416,
      "step": 210
    },
    {
      "epoch": 4.144578313253012,
      "grad_norm": 1.5020925998687744,
      "learning_rate": 3.109784300270943e-05,
      "loss": 0.1027,
      "num_input_tokens_seen": 3166784,
      "step": 215
    },
    {
      "epoch": 4.240963855421687,
      "grad_norm": 2.203794240951538,
      "learning_rate": 3.0348327080162435e-05,
      "loss": 0.0955,
      "num_input_tokens_seen": 3239584,
      "step": 220
    },
    {
      "epoch": 4.337349397590361,
      "grad_norm": 1.7183223962783813,
      "learning_rate": 2.9593737945414264e-05,
      "loss": 0.1006,
      "num_input_tokens_seen": 3313360,
      "step": 225
    },
    {
      "epoch": 4.433734939759036,
      "grad_norm": 1.4102908372879028,
      "learning_rate": 2.8834791371967142e-05,
      "loss": 0.1007,
      "num_input_tokens_seen": 3377840,
      "step": 230
    },
    {
      "epoch": 4.530120481927711,
      "grad_norm": 1.214020013809204,
      "learning_rate": 2.8072207266617855e-05,
      "loss": 0.1033,
      "num_input_tokens_seen": 3455904,
      "step": 235
    },
    {
      "epoch": 4.626506024096385,
      "grad_norm": 1.5255635976791382,
      "learning_rate": 2.7306708986582553e-05,
      "loss": 0.1023,
      "num_input_tokens_seen": 3529360,
      "step": 240
    },
    {
      "epoch": 4.72289156626506,
      "grad_norm": 1.6624009609222412,
      "learning_rate": 2.653902265334858e-05,
      "loss": 0.1121,
      "num_input_tokens_seen": 3605344,
      "step": 245
    },
    {
      "epoch": 4.8192771084337345,
      "grad_norm": 1.7999521493911743,
      "learning_rate": 2.5769876463904265e-05,
      "loss": 0.1028,
      "num_input_tokens_seen": 3678352,
      "step": 250
    },
    {
      "epoch": 4.9156626506024095,
      "grad_norm": 2.1297786235809326,
      "learning_rate": 2.5e-05,
      "loss": 0.1055,
      "num_input_tokens_seen": 3752608,
      "step": 255
    },
    {
      "epoch": 5.0120481927710845,
      "grad_norm": 1.215146780014038,
      "learning_rate": 2.4230123536095748e-05,
      "loss": 0.1037,
      "num_input_tokens_seen": 3819744,
      "step": 260
    },
    {
      "epoch": 5.108433734939759,
      "grad_norm": 1.448801040649414,
      "learning_rate": 2.346097734665143e-05,
      "loss": 0.0633,
      "num_input_tokens_seen": 3896592,
      "step": 265
    },
    {
      "epoch": 5.204819277108434,
      "grad_norm": 1.220989465713501,
      "learning_rate": 2.2693291013417453e-05,
      "loss": 0.0521,
      "num_input_tokens_seen": 3970976,
      "step": 270
    },
    {
      "epoch": 5.301204819277109,
      "grad_norm": 1.3077821731567383,
      "learning_rate": 2.192779273338215e-05,
      "loss": 0.0625,
      "num_input_tokens_seen": 4051760,
      "step": 275
    },
    {
      "epoch": 5.397590361445783,
      "grad_norm": 2.02695369720459,
      "learning_rate": 2.116520862803286e-05,
      "loss": 0.059,
      "num_input_tokens_seen": 4124096,
      "step": 280
    },
    {
      "epoch": 5.493975903614458,
      "grad_norm": 1.6377320289611816,
      "learning_rate": 2.0406262054585738e-05,
      "loss": 0.0648,
      "num_input_tokens_seen": 4188448,
      "step": 285
    },
    {
      "epoch": 5.590361445783133,
      "grad_norm": 1.6187361478805542,
      "learning_rate": 1.965167291983757e-05,
      "loss": 0.0709,
      "num_input_tokens_seen": 4261056,
      "step": 290
    },
    {
      "epoch": 5.686746987951807,
      "grad_norm": 1.4855268001556396,
      "learning_rate": 1.890215699729057e-05,
      "loss": 0.0641,
      "num_input_tokens_seen": 4329024,
      "step": 295
    },
    {
      "epoch": 5.783132530120482,
      "grad_norm": 1.4216831922531128,
      "learning_rate": 1.815842524819793e-05,
      "loss": 0.0689,
      "num_input_tokens_seen": 4406624,
      "step": 300
    },
    {
      "epoch": 5.879518072289157,
      "grad_norm": 1.7383759021759033,
      "learning_rate": 1.7421183147173915e-05,
      "loss": 0.055,
      "num_input_tokens_seen": 4480352,
      "step": 305
    },
    {
      "epoch": 5.975903614457831,
      "grad_norm": 1.5599803924560547,
      "learning_rate": 1.6691130013008514e-05,
      "loss": 0.0626,
      "num_input_tokens_seen": 4554080,
      "step": 310
    },
    {
      "epoch": 6.072289156626506,
      "grad_norm": 1.028124213218689,
      "learning_rate": 1.5968958345321178e-05,
      "loss": 0.0465,
      "num_input_tokens_seen": 4628576,
      "step": 315
    },
    {
      "epoch": 6.168674698795181,
      "grad_norm": 1.4686311483383179,
      "learning_rate": 1.5255353167683017e-05,
      "loss": 0.0421,
      "num_input_tokens_seen": 4704512,
      "step": 320
    },
    {
      "epoch": 6.265060240963855,
      "grad_norm": 1.1644634008407593,
      "learning_rate": 1.4550991377830426e-05,
      "loss": 0.0303,
      "num_input_tokens_seen": 4776912,
      "step": 325
    },
    {
      "epoch": 6.36144578313253,
      "grad_norm": 1.090997338294983,
      "learning_rate": 1.3856541105586545e-05,
      "loss": 0.0337,
      "num_input_tokens_seen": 4855600,
      "step": 330
    },
    {
      "epoch": 6.457831325301205,
      "grad_norm": 1.4336110353469849,
      "learning_rate": 1.3172661079099752e-05,
      "loss": 0.0333,
      "num_input_tokens_seen": 4927600,
      "step": 335
    },
    {
      "epoch": 6.554216867469879,
      "grad_norm": 1.3488271236419678,
      "learning_rate": 1.2500000000000006e-05,
      "loss": 0.039,
      "num_input_tokens_seen": 5003184,
      "step": 340
    },
    {
      "epoch": 6.612048192771084,
      "num_input_tokens_seen": 5052448,
      "step": 343,
      "total_flos": 2.28822660837802e+17,
      "train_loss": 0.2300173058541106,
      "train_runtime": 12675.9679,
      "train_samples_per_second": 0.655,
      "train_steps_per_second": 0.04
    }
  ],
  "logging_steps": 5,
  "max_steps": 510,
  "num_input_tokens_seen": 5052448,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.28822660837802e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}