{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.996154016179656,
  "eval_steps": 500,
  "global_step": 1059,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02829229477034614,
      "grad_norm": 94.11492156982422,
      "learning_rate": 1.25e-05,
      "loss": 0.8265,
      "step": 10
    },
    {
      "epoch": 0.05658458954069228,
      "grad_norm": 75.07470703125,
      "learning_rate": 2.5e-05,
      "loss": 0.737,
      "step": 20
    },
    {
      "epoch": 0.08487688431103842,
      "grad_norm": 63.67890930175781,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.7199,
      "step": 30
    },
    {
      "epoch": 0.11316917908138456,
      "grad_norm": 64.5537109375,
      "learning_rate": 3.9994011512354455e-05,
      "loss": 0.7145,
      "step": 40
    },
    {
      "epoch": 0.1414614738517307,
      "grad_norm": 63.26422119140625,
      "learning_rate": 3.996968942743186e-05,
      "loss": 0.6906,
      "step": 50
    },
    {
      "epoch": 0.16975376862207683,
      "grad_norm": 52.657257080078125,
      "learning_rate": 3.9926682204807304e-05,
      "loss": 0.6766,
      "step": 60
    },
    {
      "epoch": 0.19804606339242298,
      "grad_norm": 55.939781188964844,
      "learning_rate": 3.986503008526067e-05,
      "loss": 0.6663,
      "step": 70
    },
    {
      "epoch": 0.22633835816276912,
      "grad_norm": 49.46023941040039,
      "learning_rate": 3.978479075513446e-05,
      "loss": 0.6651,
      "step": 80
    },
    {
      "epoch": 0.25463065293311526,
      "grad_norm": 53.444976806640625,
      "learning_rate": 3.968603929235803e-05,
      "loss": 0.6687,
      "step": 90
    },
    {
      "epoch": 0.2829229477034614,
      "grad_norm": 49.93338394165039,
      "learning_rate": 3.956886809619913e-05,
      "loss": 0.6596,
      "step": 100
    },
    {
      "epoch": 0.3112152424738075,
      "grad_norm": 54.019474029541016,
      "learning_rate": 3.94333868008082e-05,
      "loss": 0.6334,
      "step": 110
    },
    {
      "epoch": 0.33950753724415367,
      "grad_norm": 49.52291488647461,
      "learning_rate": 3.927972217263646e-05,
      "loss": 0.636,
      "step": 120
    },
    {
      "epoch": 0.3677998320144998,
      "grad_norm": 46.9410400390625,
      "learning_rate": 3.9108017991823674e-05,
      "loss": 0.6273,
      "step": 130
    },
    {
      "epoch": 0.39609212678484595,
      "grad_norm": 48.465641021728516,
      "learning_rate": 3.891843491766674e-05,
      "loss": 0.6225,
      "step": 140
    },
    {
      "epoch": 0.42438442155519207,
      "grad_norm": 47.92714309692383,
      "learning_rate": 3.871115033829478e-05,
      "loss": 0.6358,
      "step": 150
    },
    {
      "epoch": 0.45267671632553824,
      "grad_norm": 47.684757232666016,
      "learning_rate": 3.8486358204691505e-05,
      "loss": 0.6201,
      "step": 160
    },
    {
      "epoch": 0.48096901109588436,
      "grad_norm": 48.77947235107422,
      "learning_rate": 3.824426884922018e-05,
      "loss": 0.6139,
      "step": 170
    },
    {
      "epoch": 0.5092613058662305,
      "grad_norm": 49.815006256103516,
      "learning_rate": 3.798510878882086e-05,
      "loss": 0.6247,
      "step": 180
    },
    {
      "epoch": 0.5375536006365766,
      "grad_norm": 49.60624694824219,
      "learning_rate": 3.7709120513064196e-05,
      "loss": 0.6125,
      "step": 190
    },
    {
      "epoch": 0.5658458954069228,
      "grad_norm": 47.29824447631836,
      "learning_rate": 3.741656225726005e-05,
      "loss": 0.6129,
      "step": 200
    },
    {
      "epoch": 0.5941381901772689,
      "grad_norm": 50.14078903198242,
      "learning_rate": 3.710770776083311e-05,
      "loss": 0.5902,
      "step": 210
    },
    {
      "epoch": 0.622430484947615,
      "grad_norm": 48.83024215698242,
      "learning_rate": 3.6782846011191855e-05,
      "loss": 0.6121,
      "step": 220
    },
    {
      "epoch": 0.6507227797179612,
      "grad_norm": 45.325504302978516,
      "learning_rate": 3.644228097333025e-05,
      "loss": 0.6009,
      "step": 230
    },
    {
      "epoch": 0.6790150744883073,
      "grad_norm": 51.53373718261719,
      "learning_rate": 3.60863313054153e-05,
      "loss": 0.6118,
      "step": 240
    },
    {
      "epoch": 0.7073073692586535,
      "grad_norm": 44.46132278442383,
      "learning_rate": 3.571533006062664e-05,
      "loss": 0.6042,
      "step": 250
    },
    {
      "epoch": 0.7355996640289996,
      "grad_norm": 47.890625,
      "learning_rate": 3.5329624375527e-05,
      "loss": 0.6029,
      "step": 260
    },
    {
      "epoch": 0.7638919587993458,
      "grad_norm": 45.87557601928711,
      "learning_rate": 3.492957514525522e-05,
      "loss": 0.5924,
      "step": 270
    },
    {
      "epoch": 0.7921842535696919,
      "grad_norm": 46.6375617980957,
      "learning_rate": 3.4515556685845706e-05,
      "loss": 0.5839,
      "step": 280
    },
    {
      "epoch": 0.820476548340038,
      "grad_norm": 46.20927810668945,
      "learning_rate": 3.4087956383990355e-05,
      "loss": 0.5976,
      "step": 290
    },
    {
      "epoch": 0.8487688431103841,
      "grad_norm": 44.299476623535156,
      "learning_rate": 3.36471743345705e-05,
      "loss": 0.5871,
      "step": 300
    },
    {
      "epoch": 0.8770611378807303,
      "grad_norm": 44.8925895690918,
      "learning_rate": 3.3193622966298155e-05,
      "loss": 0.5729,
      "step": 310
    },
    {
      "epoch": 0.9053534326510765,
      "grad_norm": 43.60881042480469,
      "learning_rate": 3.272772665581681e-05,
      "loss": 0.5951,
      "step": 320
    },
    {
      "epoch": 0.9336457274214226,
      "grad_norm": 44.73467254638672,
      "learning_rate": 3.2249921330622894e-05,
      "loss": 0.5865,
      "step": 330
    },
    {
      "epoch": 0.9619380221917687,
      "grad_norm": 44.39530563354492,
      "learning_rate": 3.176065406117928e-05,
      "loss": 0.5677,
      "step": 340
    },
    {
      "epoch": 0.9902303169621148,
      "grad_norm": 45.78221893310547,
      "learning_rate": 3.126038264260272e-05,
      "loss": 0.5811,
      "step": 350
    },
    {
      "epoch": 1.018522611732461,
      "grad_norm": 42.54255676269531,
      "learning_rate": 3.0749575166316376e-05,
      "loss": 0.529,
      "step": 360
    },
    {
      "epoch": 1.0468149065028072,
      "grad_norm": 44.1485595703125,
      "learning_rate": 3.022870958206845e-05,
      "loss": 0.5104,
      "step": 370
    },
    {
      "epoch": 1.0751072012731533,
      "grad_norm": 43.59065628051758,
      "learning_rate": 2.9698273250726593e-05,
      "loss": 0.4912,
      "step": 380
    },
    {
      "epoch": 1.1033994960434994,
      "grad_norm": 47.90654373168945,
      "learning_rate": 2.915876248826653e-05,
      "loss": 0.5007,
      "step": 390
    },
    {
      "epoch": 1.1316917908138455,
      "grad_norm": 48.12900924682617,
      "learning_rate": 2.861068210138169e-05,
      "loss": 0.4924,
      "step": 400
    },
    {
      "epoch": 1.1599840855841916,
      "grad_norm": 45.788028717041016,
      "learning_rate": 2.8054544915148163e-05,
      "loss": 0.4999,
      "step": 410
    },
    {
      "epoch": 1.1882763803545378,
      "grad_norm": 48.016265869140625,
      "learning_rate": 2.7490871293187226e-05,
      "loss": 0.4968,
      "step": 420
    },
    {
      "epoch": 1.2165686751248839,
      "grad_norm": 47.503753662109375,
      "learning_rate": 2.69201886507741e-05,
      "loss": 0.4911,
      "step": 430
    },
    {
      "epoch": 1.24486096989523,
      "grad_norm": 43.9008674621582,
      "learning_rate": 2.6343030961348684e-05,
      "loss": 0.4914,
      "step": 440
    },
    {
      "epoch": 1.2731532646655763,
      "grad_norm": 51.55910110473633,
      "learning_rate": 2.575993825689005e-05,
      "loss": 0.4984,
      "step": 450
    },
    {
      "epoch": 1.3014455594359224,
      "grad_norm": 46.89332962036133,
      "learning_rate": 2.5171456122622006e-05,
      "loss": 0.4913,
      "step": 460
    },
    {
      "epoch": 1.3297378542062686,
      "grad_norm": 45.62400817871094,
      "learning_rate": 2.4578135186522716e-05,
      "loss": 0.4908,
      "step": 470
    },
    {
      "epoch": 1.3580301489766147,
      "grad_norm": 43.19487380981445,
      "learning_rate": 2.3980530604115896e-05,
      "loss": 0.4899,
      "step": 480
    },
    {
      "epoch": 1.3863224437469608,
      "grad_norm": 47.675411224365234,
      "learning_rate": 2.337920153902571e-05,
      "loss": 0.5022,
      "step": 490
    },
    {
      "epoch": 1.414614738517307,
      "grad_norm": 46.81362533569336,
      "learning_rate": 2.277471063978137e-05,
      "loss": 0.497,
      "step": 500
    },
    {
      "epoch": 1.442907033287653,
      "grad_norm": 43.024845123291016,
      "learning_rate": 2.2167623513361045e-05,
      "loss": 0.486,
      "step": 510
    },
    {
      "epoch": 1.4711993280579991,
      "grad_norm": 48.46063995361328,
      "learning_rate": 2.1558508195967576e-05,
      "loss": 0.4925,
      "step": 520
    },
    {
      "epoch": 1.4994916228283452,
      "grad_norm": 45.760868072509766,
      "learning_rate": 2.0947934621531258e-05,
      "loss": 0.4904,
      "step": 530
    },
    {
      "epoch": 1.5277839175986916,
      "grad_norm": 50.22765350341797,
      "learning_rate": 2.0336474088436904e-05,
      "loss": 0.4893,
      "step": 540
    },
    {
      "epoch": 1.5560762123690375,
      "grad_norm": 46.728614807128906,
      "learning_rate": 1.9724698724974343e-05,
      "loss": 0.4863,
      "step": 550
    },
    {
      "epoch": 1.5843685071393838,
      "grad_norm": 47.76382064819336,
      "learning_rate": 1.9113180954012247e-05,
      "loss": 0.4944,
      "step": 560
    },
    {
      "epoch": 1.61266080190973,
      "grad_norm": 49.77993392944336,
      "learning_rate": 1.8502492957396484e-05,
      "loss": 0.4774,
      "step": 570
    },
    {
      "epoch": 1.640953096680076,
      "grad_norm": 46.18117904663086,
      "learning_rate": 1.7893206140573852e-05,
      "loss": 0.4694,
      "step": 580
    },
    {
      "epoch": 1.6692453914504222,
      "grad_norm": 46.24747085571289,
      "learning_rate": 1.7285890597942336e-05,
      "loss": 0.4756,
      "step": 590
    },
    {
      "epoch": 1.6975376862207683,
      "grad_norm": 46.43058776855469,
      "learning_rate": 1.668111457942811e-05,
      "loss": 0.4783,
      "step": 600
    },
    {
      "epoch": 1.7258299809911144,
      "grad_norm": 46.958919525146484,
      "learning_rate": 1.607944395878828e-05,
      "loss": 0.4776,
      "step": 610
    },
    {
      "epoch": 1.7541222757614605,
      "grad_norm": 42.604576110839844,
      "learning_rate": 1.548144170413705e-05,
      "loss": 0.4751,
      "step": 620
    },
    {
      "epoch": 1.7824145705318069,
      "grad_norm": 46.214046478271484,
      "learning_rate": 1.4887667351190508e-05,
      "loss": 0.4736,
      "step": 630
    },
    {
      "epoch": 1.8107068653021527,
      "grad_norm": 46.578739166259766,
      "learning_rate": 1.4298676479723158e-05,
      "loss": 0.4765,
      "step": 640
    },
    {
      "epoch": 1.838999160072499,
      "grad_norm": 49.75358581542969,
      "learning_rate": 1.3715020193725801e-05,
      "loss": 0.4639,
      "step": 650
    },
    {
      "epoch": 1.867291454842845,
      "grad_norm": 49.36628341674805,
      "learning_rate": 1.3137244605751364e-05,
      "loss": 0.4865,
      "step": 660
    },
    {
      "epoch": 1.8955837496131913,
      "grad_norm": 49.271121978759766,
      "learning_rate": 1.2565890325931096e-05,
      "loss": 0.4635,
      "step": 670
    },
    {
      "epoch": 1.9238760443835374,
      "grad_norm": 42.75962829589844,
      "learning_rate": 1.2001491956139177e-05,
      "loss": 0.4753,
      "step": 680
    },
    {
      "epoch": 1.9521683391538835,
      "grad_norm": 44.49867630004883,
      "learning_rate": 1.1444577589779206e-05,
      "loss": 0.4657,
      "step": 690
    },
    {
      "epoch": 1.9804606339242297,
      "grad_norm": 46.07305908203125,
      "learning_rate": 1.0895668317660404e-05,
      "loss": 0.4668,
      "step": 700
    },
    {
      "epoch": 2.008752928694576,
      "grad_norm": 53.604557037353516,
      "learning_rate": 1.0355277740426017e-05,
      "loss": 0.444,
      "step": 710
    },
    {
      "epoch": 2.037045223464922,
      "grad_norm": 52.05387878417969,
      "learning_rate": 9.823911487990083e-06,
      "loss": 0.3918,
      "step": 720
    },
    {
      "epoch": 2.065337518235268,
      "grad_norm": 52.156349182128906,
      "learning_rate": 9.302066746432183e-06,
      "loss": 0.387,
      "step": 730
    },
    {
      "epoch": 2.0936298130056143,
      "grad_norm": 51.72985076904297,
      "learning_rate": 8.790231792792914e-06,
      "loss": 0.3875,
      "step": 740
    },
    {
      "epoch": 2.1219221077759602,
      "grad_norm": 53.109046936035156,
      "learning_rate": 8.28888553820526e-06,
      "loss": 0.3921,
      "step": 750
    },
    {
      "epoch": 2.1502144025463066,
      "grad_norm": 51.85807418823242,
      "learning_rate": 7.798497079789513e-06,
      "loss": 0.3933,
      "step": 760
    },
    {
      "epoch": 2.1785066973166525,
      "grad_norm": 53.21246337890625,
      "learning_rate": 7.319525261730822e-06,
      "loss": 0.3889,
      "step": 770
    },
    {
      "epoch": 2.206798992086999,
      "grad_norm": 52.26942443847656,
      "learning_rate": 6.852418245950195e-06,
      "loss": 0.3938,
      "step": 780
    },
    {
      "epoch": 2.235091286857345,
      "grad_norm": 52.105838775634766,
      "learning_rate": 6.397613092770641e-06,
      "loss": 0.3934,
      "step": 790
    },
    {
      "epoch": 2.263383581627691,
      "grad_norm": 50.32993698120117,
      "learning_rate": 5.955535351970754e-06,
      "loss": 0.3884,
      "step": 800
    },
    {
      "epoch": 2.2916758763980374,
      "grad_norm": 52.58189392089844,
      "learning_rate": 5.5265986646085e-06,
      "loss": 0.3894,
      "step": 810
    },
    {
      "epoch": 2.3199681711683833,
      "grad_norm": 51.35353469848633,
      "learning_rate": 5.111204375987615e-06,
      "loss": 0.385,
      "step": 820
    },
    {
      "epoch": 2.3482604659387296,
      "grad_norm": 52.65754318237305,
      "learning_rate": 4.709741160128878e-06,
      "loss": 0.372,
      "step": 830
    },
    {
      "epoch": 2.3765527607090755,
      "grad_norm": 52.932281494140625,
      "learning_rate": 4.322584656097577e-06,
      "loss": 0.3782,
      "step": 840
    },
    {
      "epoch": 2.404845055479422,
      "grad_norm": 48.66704177856445,
      "learning_rate": 3.950097116527449e-06,
      "loss": 0.3919,
      "step": 850
    },
    {
      "epoch": 2.4331373502497677,
      "grad_norm": 52.752803802490234,
      "learning_rate": 3.592627068669983e-06,
      "loss": 0.3823,
      "step": 860
    },
    {
      "epoch": 2.461429645020114,
      "grad_norm": 52.2065544128418,
      "learning_rate": 3.2505089882861964e-06,
      "loss": 0.3767,
      "step": 870
    },
    {
      "epoch": 2.48972193979046,
      "grad_norm": 50.5092887878418,
      "learning_rate": 2.9240629866860538e-06,
      "loss": 0.3719,
      "step": 880
    },
    {
      "epoch": 2.5180142345608063,
      "grad_norm": 54.0071907043457,
      "learning_rate": 2.6135945112083506e-06,
      "loss": 0.3793,
      "step": 890
    },
    {
      "epoch": 2.5463065293311526,
      "grad_norm": 51.075164794921875,
      "learning_rate": 2.3193940594212917e-06,
      "loss": 0.3789,
      "step": 900
    },
    {
      "epoch": 2.5745988241014985,
      "grad_norm": 53.48076629638672,
      "learning_rate": 2.0417369073111936e-06,
      "loss": 0.3817,
      "step": 910
    },
    {
      "epoch": 2.602891118871845,
      "grad_norm": 54.31864929199219,
      "learning_rate": 1.7808828517136612e-06,
      "loss": 0.3857,
      "step": 920
    },
    {
      "epoch": 2.6311834136421908,
      "grad_norm": 51.0897216796875,
      "learning_rate": 1.5370759672281744e-06,
      "loss": 0.3776,
      "step": 930
    },
    {
      "epoch": 2.659475708412537,
      "grad_norm": 53.861427307128906,
      "learning_rate": 1.3105443778436388e-06,
      "loss": 0.3766,
      "step": 940
    },
    {
      "epoch": 2.687768003182883,
      "grad_norm": 49.185302734375,
      "learning_rate": 1.1015000434884682e-06,
      "loss": 0.384,
      "step": 950
    },
    {
      "epoch": 2.7160602979532293,
      "grad_norm": 53.21417999267578,
      "learning_rate": 9.101385617050007e-07,
      "loss": 0.3723,
      "step": 960
    },
    {
      "epoch": 2.7443525927235752,
      "grad_norm": 54.70134353637695,
      "learning_rate": 7.366389846337841e-07,
      "loss": 0.3815,
      "step": 970
    },
    {
      "epoch": 2.7726448874939216,
      "grad_norm": 53.2882194519043,
      "learning_rate": 5.811636514789598e-07,
      "loss": 0.3806,
      "step": 980
    },
    {
      "epoch": 2.800937182264268,
      "grad_norm": 52.69584274291992,
      "learning_rate": 4.43858036611573e-07,
      "loss": 0.3814,
      "step": 990
    },
    {
      "epoch": 2.829229477034614,
      "grad_norm": 55.783634185791016,
      "learning_rate": 3.2485061345282286e-07,
      "loss": 0.3838,
      "step": 1000
    },
    {
      "epoch": 2.85752177180496,
      "grad_norm": 49.773502349853516,
      "learning_rate": 2.2425273426471162e-07,
      "loss": 0.3829,
      "step": 1010
    },
    {
      "epoch": 2.885814066575306,
      "grad_norm": 50.421417236328125,
      "learning_rate": 1.421585259605318e-07,
      "loss": 0.3802,
      "step": 1020
    },
    {
      "epoch": 2.9141063613456524,
      "grad_norm": 56.082252502441406,
      "learning_rate": 7.864480203266356e-08,
      "loss": 0.3815,
      "step": 1030
    },
    {
      "epoch": 2.9423986561159983,
      "grad_norm": 52.35788345336914,
      "learning_rate": 3.37709906801198e-08,
      "loss": 0.3846,
      "step": 1040
    },
    {
      "epoch": 2.9706909508863446,
      "grad_norm": 51.95500564575195,
      "learning_rate": 7.579079203074991e-09,
      "loss": 0.3727,
      "step": 1050
    },
    {
      "epoch": 2.996154016179656,
      "step": 1059,
      "total_flos": 9.569143748664361e+17,
      "train_loss": 0.5005588545002275,
      "train_runtime": 18779.252,
      "train_samples_per_second": 7.227,
      "train_steps_per_second": 0.056
    }
  ],
  "logging_steps": 10,
  "max_steps": 1059,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.569143748664361e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}