|
{ |
|
"best_metric": 0.001362333190627396, |
|
"best_model_checkpoint": "/home/paperspace/Data/models/reliance/llm3br256/checkpoint-1200", |
|
"epoch": 10.728744939271255, |
|
"eval_steps": 25, |
|
"global_step": 1325, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008097165991902834, |
|
"grad_norm": 0.08275424689054489, |
|
"learning_rate": 3.246753246753247e-07, |
|
"loss": 0.0308, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.016194331983805668, |
|
"grad_norm": 0.07216893136501312, |
|
"learning_rate": 6.493506493506494e-07, |
|
"loss": 0.023, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.024291497975708502, |
|
"grad_norm": 0.07287438213825226, |
|
"learning_rate": 9.74025974025974e-07, |
|
"loss": 0.0238, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.032388663967611336, |
|
"grad_norm": 0.08366404473781586, |
|
"learning_rate": 1.2987012987012988e-06, |
|
"loss": 0.0231, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.04048582995951417, |
|
"grad_norm": 0.06945216655731201, |
|
"learning_rate": 1.6233766233766232e-06, |
|
"loss": 0.0274, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.048582995951417005, |
|
"grad_norm": 0.0757410079240799, |
|
"learning_rate": 1.948051948051948e-06, |
|
"loss": 0.0254, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05668016194331984, |
|
"grad_norm": 0.07382987439632416, |
|
"learning_rate": 2.2727272727272728e-06, |
|
"loss": 0.0265, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.06477732793522267, |
|
"grad_norm": 0.06420081108808517, |
|
"learning_rate": 2.5974025974025976e-06, |
|
"loss": 0.0253, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0728744939271255, |
|
"grad_norm": 0.06388016045093536, |
|
"learning_rate": 2.922077922077922e-06, |
|
"loss": 0.0246, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.08097165991902834, |
|
"grad_norm": 0.06586486101150513, |
|
"learning_rate": 3.2467532467532465e-06, |
|
"loss": 0.0237, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08906882591093117, |
|
"grad_norm": 0.057881902903318405, |
|
"learning_rate": 3.5714285714285714e-06, |
|
"loss": 0.0223, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.09716599190283401, |
|
"grad_norm": 0.0731276124715805, |
|
"learning_rate": 3.896103896103896e-06, |
|
"loss": 0.0221, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 0.04406806081533432, |
|
"learning_rate": 4.220779220779221e-06, |
|
"loss": 0.0195, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.11336032388663968, |
|
"grad_norm": 0.0436287596821785, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 0.0168, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1214574898785425, |
|
"grad_norm": 0.047254908829927444, |
|
"learning_rate": 4.870129870129871e-06, |
|
"loss": 0.0224, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12955465587044535, |
|
"grad_norm": 0.041482556611299515, |
|
"learning_rate": 5.194805194805195e-06, |
|
"loss": 0.0193, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.13765182186234817, |
|
"grad_norm": 0.04704615846276283, |
|
"learning_rate": 5.51948051948052e-06, |
|
"loss": 0.0219, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.145748987854251, |
|
"grad_norm": 0.04699448496103287, |
|
"learning_rate": 5.844155844155844e-06, |
|
"loss": 0.0218, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 0.049021102488040924, |
|
"learning_rate": 6.168831168831169e-06, |
|
"loss": 0.0243, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.16194331983805668, |
|
"grad_norm": 0.03877793252468109, |
|
"learning_rate": 6.493506493506493e-06, |
|
"loss": 0.0167, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1700404858299595, |
|
"grad_norm": 0.041388873010873795, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 0.0172, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.17813765182186234, |
|
"grad_norm": 0.04476911574602127, |
|
"learning_rate": 7.142857142857143e-06, |
|
"loss": 0.0215, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1862348178137652, |
|
"grad_norm": 0.03552476316690445, |
|
"learning_rate": 7.467532467532468e-06, |
|
"loss": 0.0179, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.19433198380566802, |
|
"grad_norm": 0.03406437113881111, |
|
"learning_rate": 7.792207792207792e-06, |
|
"loss": 0.0207, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.20242914979757085, |
|
"grad_norm": 0.030436363071203232, |
|
"learning_rate": 8.116883116883117e-06, |
|
"loss": 0.0197, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.20242914979757085, |
|
"eval_loss": 0.017658039927482605, |
|
"eval_runtime": 22.7487, |
|
"eval_samples_per_second": 4.396, |
|
"eval_steps_per_second": 1.099, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 0.030306411907076836, |
|
"learning_rate": 8.441558441558442e-06, |
|
"loss": 0.0148, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.21862348178137653, |
|
"grad_norm": 0.02774702198803425, |
|
"learning_rate": 8.766233766233767e-06, |
|
"loss": 0.0168, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.22672064777327935, |
|
"grad_norm": 0.026585258543491364, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 0.0115, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.23481781376518218, |
|
"grad_norm": 0.026557059958577156, |
|
"learning_rate": 9.415584415584416e-06, |
|
"loss": 0.0154, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.242914979757085, |
|
"grad_norm": 0.026998251676559448, |
|
"learning_rate": 9.740259740259742e-06, |
|
"loss": 0.014, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.25101214574898784, |
|
"grad_norm": 0.027236543595790863, |
|
"learning_rate": 1.0064935064935065e-05, |
|
"loss": 0.0152, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2591093117408907, |
|
"grad_norm": 0.029114605858922005, |
|
"learning_rate": 1.038961038961039e-05, |
|
"loss": 0.0157, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.26720647773279355, |
|
"grad_norm": 0.02437474951148033, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 0.0163, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.27530364372469635, |
|
"grad_norm": 0.023844681680202484, |
|
"learning_rate": 1.103896103896104e-05, |
|
"loss": 0.0147, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2834008097165992, |
|
"grad_norm": 0.021733107045292854, |
|
"learning_rate": 1.1363636363636365e-05, |
|
"loss": 0.0164, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.291497975708502, |
|
"grad_norm": 0.022121932357549667, |
|
"learning_rate": 1.1688311688311688e-05, |
|
"loss": 0.0142, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.29959514170040485, |
|
"grad_norm": 0.020241033285856247, |
|
"learning_rate": 1.2012987012987014e-05, |
|
"loss": 0.015, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.02085702121257782, |
|
"learning_rate": 1.2337662337662339e-05, |
|
"loss": 0.0147, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3157894736842105, |
|
"grad_norm": 0.023749109357595444, |
|
"learning_rate": 1.2662337662337662e-05, |
|
"loss": 0.0156, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.32388663967611336, |
|
"grad_norm": 0.02149099111557007, |
|
"learning_rate": 1.2987012987012986e-05, |
|
"loss": 0.0125, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.3319838056680162, |
|
"grad_norm": 0.020449506118893623, |
|
"learning_rate": 1.3311688311688311e-05, |
|
"loss": 0.0107, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.340080971659919, |
|
"grad_norm": 0.020927896723151207, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 0.0119, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3481781376518219, |
|
"grad_norm": 0.018237633630633354, |
|
"learning_rate": 1.396103896103896e-05, |
|
"loss": 0.0109, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3562753036437247, |
|
"grad_norm": 0.019094541668891907, |
|
"learning_rate": 1.4285714285714285e-05, |
|
"loss": 0.0113, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.3643724696356275, |
|
"grad_norm": 0.020349925383925438, |
|
"learning_rate": 1.461038961038961e-05, |
|
"loss": 0.018, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3724696356275304, |
|
"grad_norm": 0.017563968896865845, |
|
"learning_rate": 1.4935064935064936e-05, |
|
"loss": 0.0105, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3805668016194332, |
|
"grad_norm": 0.020637603476643562, |
|
"learning_rate": 1.525974025974026e-05, |
|
"loss": 0.0111, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.38866396761133604, |
|
"grad_norm": 0.01847653090953827, |
|
"learning_rate": 1.5584415584415583e-05, |
|
"loss": 0.0104, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3967611336032389, |
|
"grad_norm": 0.019373638555407524, |
|
"learning_rate": 1.590909090909091e-05, |
|
"loss": 0.0122, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.4048582995951417, |
|
"grad_norm": 0.01981317810714245, |
|
"learning_rate": 1.6233766233766234e-05, |
|
"loss": 0.0118, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4048582995951417, |
|
"eval_loss": 0.011365661397576332, |
|
"eval_runtime": 20.8684, |
|
"eval_samples_per_second": 4.792, |
|
"eval_steps_per_second": 1.198, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.41295546558704455, |
|
"grad_norm": 0.024859532713890076, |
|
"learning_rate": 1.655844155844156e-05, |
|
"loss": 0.013, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 0.02114563249051571, |
|
"learning_rate": 1.6883116883116884e-05, |
|
"loss": 0.0101, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.4291497975708502, |
|
"grad_norm": 0.017897600308060646, |
|
"learning_rate": 1.7207792207792208e-05, |
|
"loss": 0.0102, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.43724696356275305, |
|
"grad_norm": 0.0172622948884964, |
|
"learning_rate": 1.7532467532467535e-05, |
|
"loss": 0.009, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.44534412955465585, |
|
"grad_norm": 0.01602226495742798, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 0.0094, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.4534412955465587, |
|
"grad_norm": 0.018682394176721573, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.0101, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 0.016763649880886078, |
|
"learning_rate": 1.850649350649351e-05, |
|
"loss": 0.0091, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.46963562753036436, |
|
"grad_norm": 0.021187469363212585, |
|
"learning_rate": 1.8831168831168833e-05, |
|
"loss": 0.0085, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.4777327935222672, |
|
"grad_norm": 0.01601949706673622, |
|
"learning_rate": 1.9155844155844156e-05, |
|
"loss": 0.0069, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.48582995951417, |
|
"grad_norm": 0.012528536841273308, |
|
"learning_rate": 1.9480519480519483e-05, |
|
"loss": 0.0083, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4939271255060729, |
|
"grad_norm": 0.019854655489325523, |
|
"learning_rate": 1.9805194805194807e-05, |
|
"loss": 0.0118, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5020242914979757, |
|
"grad_norm": 0.016604121774435043, |
|
"learning_rate": 2.012987012987013e-05, |
|
"loss": 0.0078, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.5101214574898786, |
|
"grad_norm": 0.017011208459734917, |
|
"learning_rate": 2.0454545454545457e-05, |
|
"loss": 0.0096, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5182186234817814, |
|
"grad_norm": 0.017113033682107925, |
|
"learning_rate": 2.077922077922078e-05, |
|
"loss": 0.0121, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 0.014709901064634323, |
|
"learning_rate": 2.1103896103896105e-05, |
|
"loss": 0.0066, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5344129554655871, |
|
"grad_norm": 0.01747279427945614, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 0.0071, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5425101214574899, |
|
"grad_norm": 0.01678309217095375, |
|
"learning_rate": 2.1753246753246752e-05, |
|
"loss": 0.0061, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.5506072874493927, |
|
"grad_norm": 0.014886225573718548, |
|
"learning_rate": 2.207792207792208e-05, |
|
"loss": 0.007, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5587044534412956, |
|
"grad_norm": 0.017061002552509308, |
|
"learning_rate": 2.2402597402597402e-05, |
|
"loss": 0.0094, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5668016194331984, |
|
"grad_norm": 0.014715418219566345, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.0066, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5748987854251012, |
|
"grad_norm": 0.018518812954425812, |
|
"learning_rate": 2.3051948051948053e-05, |
|
"loss": 0.01, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.582995951417004, |
|
"grad_norm": 0.020052259787917137, |
|
"learning_rate": 2.3376623376623376e-05, |
|
"loss": 0.011, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5910931174089069, |
|
"grad_norm": 0.01645250990986824, |
|
"learning_rate": 2.3701298701298703e-05, |
|
"loss": 0.0052, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5991902834008097, |
|
"grad_norm": 0.015539892017841339, |
|
"learning_rate": 2.4025974025974027e-05, |
|
"loss": 0.0096, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6072874493927125, |
|
"grad_norm": 0.017328433692455292, |
|
"learning_rate": 2.435064935064935e-05, |
|
"loss": 0.0103, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6072874493927125, |
|
"eval_loss": 0.007956410758197308, |
|
"eval_runtime": 20.8871, |
|
"eval_samples_per_second": 4.788, |
|
"eval_steps_per_second": 1.197, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.016708405688405037, |
|
"learning_rate": 2.4675324675324678e-05, |
|
"loss": 0.0079, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.6234817813765182, |
|
"grad_norm": 0.018906861543655396, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.0072, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 0.017756767570972443, |
|
"learning_rate": 2.5324675324675325e-05, |
|
"loss": 0.0094, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.6396761133603239, |
|
"grad_norm": 0.016792573034763336, |
|
"learning_rate": 2.5649350649350652e-05, |
|
"loss": 0.0085, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.6477732793522267, |
|
"grad_norm": 0.016278818249702454, |
|
"learning_rate": 2.5974025974025972e-05, |
|
"loss": 0.0057, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6558704453441295, |
|
"grad_norm": 0.015400170348584652, |
|
"learning_rate": 2.62987012987013e-05, |
|
"loss": 0.0075, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6639676113360324, |
|
"grad_norm": 0.012865799479186535, |
|
"learning_rate": 2.6623376623376623e-05, |
|
"loss": 0.0063, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6720647773279352, |
|
"grad_norm": 0.014955022372305393, |
|
"learning_rate": 2.694805194805195e-05, |
|
"loss": 0.0093, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.680161943319838, |
|
"grad_norm": 0.015082084573805332, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 0.0051, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6882591093117408, |
|
"grad_norm": 0.01421983353793621, |
|
"learning_rate": 2.75974025974026e-05, |
|
"loss": 0.0056, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6963562753036437, |
|
"grad_norm": 0.017437629401683807, |
|
"learning_rate": 2.792207792207792e-05, |
|
"loss": 0.0075, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.7044534412955465, |
|
"grad_norm": 0.19036760926246643, |
|
"learning_rate": 2.824675324675325e-05, |
|
"loss": 0.0085, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.7125506072874493, |
|
"grad_norm": 0.013543471693992615, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.0055, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.7206477732793523, |
|
"grad_norm": 0.029237190261483192, |
|
"learning_rate": 2.8896103896103898e-05, |
|
"loss": 0.0049, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.728744939271255, |
|
"grad_norm": 0.017158357426524162, |
|
"learning_rate": 2.922077922077922e-05, |
|
"loss": 0.0061, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7368421052631579, |
|
"grad_norm": 0.01913885958492756, |
|
"learning_rate": 2.954545454545455e-05, |
|
"loss": 0.0074, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.7449392712550608, |
|
"grad_norm": 0.037916868925094604, |
|
"learning_rate": 2.9870129870129872e-05, |
|
"loss": 0.0081, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.7530364372469636, |
|
"grad_norm": 0.018052248284220695, |
|
"learning_rate": 3.01948051948052e-05, |
|
"loss": 0.0075, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7611336032388664, |
|
"grad_norm": 0.01774253509938717, |
|
"learning_rate": 3.051948051948052e-05, |
|
"loss": 0.0091, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.019465815275907516, |
|
"learning_rate": 3.084415584415585e-05, |
|
"loss": 0.0083, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7773279352226721, |
|
"grad_norm": 0.01778685301542282, |
|
"learning_rate": 3.1168831168831166e-05, |
|
"loss": 0.0061, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7854251012145749, |
|
"grad_norm": 0.017645837739109993, |
|
"learning_rate": 3.14935064935065e-05, |
|
"loss": 0.0088, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7935222672064778, |
|
"grad_norm": 0.013044299557805061, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 0.0046, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.8016194331983806, |
|
"grad_norm": 0.015588215552270412, |
|
"learning_rate": 3.2142857142857144e-05, |
|
"loss": 0.0048, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.8097165991902834, |
|
"grad_norm": 0.014032929204404354, |
|
"learning_rate": 3.246753246753247e-05, |
|
"loss": 0.0091, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8097165991902834, |
|
"eval_loss": 0.007168960757553577, |
|
"eval_runtime": 20.8882, |
|
"eval_samples_per_second": 4.787, |
|
"eval_steps_per_second": 1.197, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8178137651821862, |
|
"grad_norm": 0.013353945687413216, |
|
"learning_rate": 3.27922077922078e-05, |
|
"loss": 0.0049, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.8259109311740891, |
|
"grad_norm": 0.012625321745872498, |
|
"learning_rate": 3.311688311688312e-05, |
|
"loss": 0.0045, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.8340080971659919, |
|
"grad_norm": 0.01578591763973236, |
|
"learning_rate": 3.344155844155844e-05, |
|
"loss": 0.0064, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.02107596956193447, |
|
"learning_rate": 3.376623376623377e-05, |
|
"loss": 0.0111, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.8502024291497976, |
|
"grad_norm": 0.014094969257712364, |
|
"learning_rate": 3.409090909090909e-05, |
|
"loss": 0.0043, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8582995951417004, |
|
"grad_norm": 0.01773056574165821, |
|
"learning_rate": 3.4415584415584416e-05, |
|
"loss": 0.0065, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8663967611336032, |
|
"grad_norm": 0.01486600749194622, |
|
"learning_rate": 3.474025974025974e-05, |
|
"loss": 0.0052, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8744939271255061, |
|
"grad_norm": 0.01461310125887394, |
|
"learning_rate": 3.506493506493507e-05, |
|
"loss": 0.0037, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8825910931174089, |
|
"grad_norm": 0.0219147726893425, |
|
"learning_rate": 3.5389610389610387e-05, |
|
"loss": 0.007, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8906882591093117, |
|
"grad_norm": 0.01585337519645691, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.004, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8987854251012146, |
|
"grad_norm": 0.01616801507771015, |
|
"learning_rate": 3.603896103896104e-05, |
|
"loss": 0.006, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.9068825910931174, |
|
"grad_norm": 0.015305282548069954, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.0045, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.9149797570850202, |
|
"grad_norm": 0.013390602543950081, |
|
"learning_rate": 3.668831168831169e-05, |
|
"loss": 0.0054, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.016158539801836014, |
|
"learning_rate": 3.701298701298702e-05, |
|
"loss": 0.0053, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.9311740890688259, |
|
"grad_norm": 0.015498949214816093, |
|
"learning_rate": 3.7337662337662335e-05, |
|
"loss": 0.0044, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.9392712550607287, |
|
"grad_norm": 0.013625388033688068, |
|
"learning_rate": 3.7662337662337665e-05, |
|
"loss": 0.0045, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.9473684210526315, |
|
"grad_norm": 0.018029414117336273, |
|
"learning_rate": 3.798701298701299e-05, |
|
"loss": 0.0051, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.9554655870445344, |
|
"grad_norm": 0.018329549580812454, |
|
"learning_rate": 3.831168831168831e-05, |
|
"loss": 0.0078, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.9635627530364372, |
|
"grad_norm": 0.015500400215387344, |
|
"learning_rate": 3.8636363636363636e-05, |
|
"loss": 0.0036, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.97165991902834, |
|
"grad_norm": 0.01624232903122902, |
|
"learning_rate": 3.8961038961038966e-05, |
|
"loss": 0.0063, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.979757085020243, |
|
"grad_norm": 0.014512493275105953, |
|
"learning_rate": 3.928571428571429e-05, |
|
"loss": 0.0042, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9878542510121457, |
|
"grad_norm": 0.018440047279000282, |
|
"learning_rate": 3.9610389610389614e-05, |
|
"loss": 0.0058, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.9959514170040485, |
|
"grad_norm": 0.011620835401117802, |
|
"learning_rate": 3.993506493506494e-05, |
|
"loss": 0.0029, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.0040485829959513, |
|
"grad_norm": 0.028106795623898506, |
|
"learning_rate": 4.025974025974026e-05, |
|
"loss": 0.012, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.0121457489878543, |
|
"grad_norm": 0.009489455260336399, |
|
"learning_rate": 4.0584415584415584e-05, |
|
"loss": 0.003, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.0121457489878543, |
|
"eval_loss": 0.0060044582933187485, |
|
"eval_runtime": 20.8715, |
|
"eval_samples_per_second": 4.791, |
|
"eval_steps_per_second": 1.198, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.0202429149797572, |
|
"grad_norm": 0.01749836467206478, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 0.0087, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.0283400809716599, |
|
"grad_norm": 0.011480778455734253, |
|
"learning_rate": 4.123376623376624e-05, |
|
"loss": 0.0035, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.0364372469635628, |
|
"grad_norm": 0.012941240333020687, |
|
"learning_rate": 4.155844155844156e-05, |
|
"loss": 0.0053, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.0445344129554657, |
|
"grad_norm": 0.012464286759495735, |
|
"learning_rate": 4.1883116883116886e-05, |
|
"loss": 0.0041, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 0.013026767410337925, |
|
"learning_rate": 4.220779220779221e-05, |
|
"loss": 0.0058, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0607287449392713, |
|
"grad_norm": 0.014864835888147354, |
|
"learning_rate": 4.253246753246753e-05, |
|
"loss": 0.0037, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.0688259109311742, |
|
"grad_norm": 0.011576451361179352, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 0.0038, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 0.016221897676587105, |
|
"learning_rate": 4.318181818181819e-05, |
|
"loss": 0.005, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.0850202429149798, |
|
"grad_norm": 0.013863411732017994, |
|
"learning_rate": 4.3506493506493503e-05, |
|
"loss": 0.0052, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.0931174089068827, |
|
"grad_norm": 0.014415189623832703, |
|
"learning_rate": 4.3831168831168834e-05, |
|
"loss": 0.0041, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.1012145748987854, |
|
"grad_norm": 0.014737873338162899, |
|
"learning_rate": 4.415584415584416e-05, |
|
"loss": 0.0042, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.1093117408906883, |
|
"grad_norm": 0.015526373870670795, |
|
"learning_rate": 4.448051948051948e-05, |
|
"loss": 0.0029, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.117408906882591, |
|
"grad_norm": 0.014790773391723633, |
|
"learning_rate": 4.4805194805194805e-05, |
|
"loss": 0.0052, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.125506072874494, |
|
"grad_norm": 0.02353314682841301, |
|
"learning_rate": 4.5129870129870135e-05, |
|
"loss": 0.0093, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.1336032388663968, |
|
"grad_norm": 0.016826335340738297, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.0052, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.1417004048582995, |
|
"grad_norm": 0.014538138173520565, |
|
"learning_rate": 4.577922077922078e-05, |
|
"loss": 0.0055, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.1497975708502024, |
|
"grad_norm": 0.016404012218117714, |
|
"learning_rate": 4.6103896103896106e-05, |
|
"loss": 0.0044, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.1578947368421053, |
|
"grad_norm": 0.014474052004516125, |
|
"learning_rate": 4.642857142857143e-05, |
|
"loss": 0.0037, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.165991902834008, |
|
"grad_norm": 0.01470700092613697, |
|
"learning_rate": 4.675324675324675e-05, |
|
"loss": 0.0069, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.174089068825911, |
|
"grad_norm": 0.01384800300002098, |
|
"learning_rate": 4.707792207792208e-05, |
|
"loss": 0.0066, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.1821862348178138, |
|
"grad_norm": 0.012554049491882324, |
|
"learning_rate": 4.740259740259741e-05, |
|
"loss": 0.0044, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.1902834008097165, |
|
"grad_norm": 0.015297799371182919, |
|
"learning_rate": 4.772727272727273e-05, |
|
"loss": 0.0048, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.1983805668016194, |
|
"grad_norm": 0.013141577132046223, |
|
"learning_rate": 4.8051948051948054e-05, |
|
"loss": 0.0024, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.2064777327935223, |
|
"grad_norm": 0.013096342794597149, |
|
"learning_rate": 4.8376623376623384e-05, |
|
"loss": 0.0025, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.214574898785425, |
|
"grad_norm": 0.01221439242362976, |
|
"learning_rate": 4.87012987012987e-05, |
|
"loss": 0.0036, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.214574898785425, |
|
"eval_loss": 0.005298578180372715, |
|
"eval_runtime": 20.8777, |
|
"eval_samples_per_second": 4.79, |
|
"eval_steps_per_second": 1.197, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.222672064777328, |
|
"grad_norm": 0.017413007095456123, |
|
"learning_rate": 4.902597402597403e-05, |
|
"loss": 0.0073, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 0.020000923424959183, |
|
"learning_rate": 4.9350649350649355e-05, |
|
"loss": 0.0029, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.2388663967611335, |
|
"grad_norm": 0.013662380166351795, |
|
"learning_rate": 4.967532467532468e-05, |
|
"loss": 0.004, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.2469635627530364, |
|
"grad_norm": 0.013253867626190186, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0029, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.2550607287449393, |
|
"grad_norm": 0.016671188175678253, |
|
"learning_rate": 5.032467532467533e-05, |
|
"loss": 0.0034, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.263157894736842, |
|
"grad_norm": 0.012826805002987385, |
|
"learning_rate": 5.064935064935065e-05, |
|
"loss": 0.0026, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.271255060728745, |
|
"grad_norm": 0.016341542825102806, |
|
"learning_rate": 5.097402597402597e-05, |
|
"loss": 0.0046, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.2793522267206479, |
|
"grad_norm": 0.013105432502925396, |
|
"learning_rate": 5.1298701298701304e-05, |
|
"loss": 0.0028, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.2874493927125505, |
|
"grad_norm": 0.015593166463077068, |
|
"learning_rate": 5.162337662337663e-05, |
|
"loss": 0.0044, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.2955465587044535, |
|
"grad_norm": 0.017734261229634285, |
|
"learning_rate": 5.1948051948051944e-05, |
|
"loss": 0.0053, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3036437246963564, |
|
"grad_norm": 0.013654530048370361, |
|
"learning_rate": 5.2272727272727274e-05, |
|
"loss": 0.0036, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.311740890688259, |
|
"grad_norm": 0.01586996205151081, |
|
"learning_rate": 5.25974025974026e-05, |
|
"loss": 0.0028, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.319838056680162, |
|
"grad_norm": 0.014020202681422234, |
|
"learning_rate": 5.292207792207793e-05, |
|
"loss": 0.0033, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.3279352226720649, |
|
"grad_norm": 0.014661739580333233, |
|
"learning_rate": 5.3246753246753245e-05, |
|
"loss": 0.0022, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.3360323886639676, |
|
"grad_norm": 0.015314622782170773, |
|
"learning_rate": 5.3571428571428575e-05, |
|
"loss": 0.0047, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.3441295546558705, |
|
"grad_norm": 0.016851790249347687, |
|
"learning_rate": 5.38961038961039e-05, |
|
"loss": 0.0029, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.3522267206477734, |
|
"grad_norm": 0.01127530261874199, |
|
"learning_rate": 5.422077922077923e-05, |
|
"loss": 0.0031, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.360323886639676, |
|
"grad_norm": 0.012864851392805576, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 0.0026, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.368421052631579, |
|
"grad_norm": 0.012660622596740723, |
|
"learning_rate": 5.487012987012987e-05, |
|
"loss": 0.0033, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.376518218623482, |
|
"grad_norm": 0.020632926374673843, |
|
"learning_rate": 5.51948051948052e-05, |
|
"loss": 0.005, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 0.014771834947168827, |
|
"learning_rate": 5.5519480519480524e-05, |
|
"loss": 0.005, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.3927125506072875, |
|
"grad_norm": 0.014798545278608799, |
|
"learning_rate": 5.584415584415584e-05, |
|
"loss": 0.0018, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.4008097165991904, |
|
"grad_norm": 0.01847289875149727, |
|
"learning_rate": 5.616883116883117e-05, |
|
"loss": 0.0048, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.408906882591093, |
|
"grad_norm": 0.013270820491015911, |
|
"learning_rate": 5.64935064935065e-05, |
|
"loss": 0.0045, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.417004048582996, |
|
"grad_norm": 0.0156845785677433, |
|
"learning_rate": 5.6818181818181825e-05, |
|
"loss": 0.0052, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.417004048582996, |
|
"eval_loss": 0.004898196551948786, |
|
"eval_runtime": 20.8955, |
|
"eval_samples_per_second": 4.786, |
|
"eval_steps_per_second": 1.196, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.425101214574899, |
|
"grad_norm": 0.0185660719871521, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.003, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.4331983805668016, |
|
"grad_norm": 0.016853397712111473, |
|
"learning_rate": 5.746753246753247e-05, |
|
"loss": 0.0033, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.4412955465587045, |
|
"grad_norm": 0.012373693287372589, |
|
"learning_rate": 5.7792207792207796e-05, |
|
"loss": 0.0027, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.4493927125506074, |
|
"grad_norm": 0.02164478786289692, |
|
"learning_rate": 5.8116883116883126e-05, |
|
"loss": 0.0039, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.45748987854251, |
|
"grad_norm": 0.019912002608180046, |
|
"learning_rate": 5.844155844155844e-05, |
|
"loss": 0.0041, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.465587044534413, |
|
"grad_norm": 0.011308755725622177, |
|
"learning_rate": 5.8766233766233766e-05, |
|
"loss": 0.0024, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.4736842105263157, |
|
"grad_norm": 0.014568260870873928, |
|
"learning_rate": 5.90909090909091e-05, |
|
"loss": 0.0016, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.4817813765182186, |
|
"grad_norm": 0.011573289521038532, |
|
"learning_rate": 5.9415584415584414e-05, |
|
"loss": 0.0022, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.4898785425101215, |
|
"grad_norm": 0.014651118777692318, |
|
"learning_rate": 5.9740259740259744e-05, |
|
"loss": 0.0051, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.4979757085020242, |
|
"grad_norm": 0.014680047519505024, |
|
"learning_rate": 6.006493506493507e-05, |
|
"loss": 0.0024, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.5060728744939271, |
|
"grad_norm": 0.015858447179198265, |
|
"learning_rate": 6.03896103896104e-05, |
|
"loss": 0.0038, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.5141700404858298, |
|
"grad_norm": 0.015239309519529343, |
|
"learning_rate": 6.0714285714285715e-05, |
|
"loss": 0.0036, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.522267206477733, |
|
"grad_norm": 0.01742137223482132, |
|
"learning_rate": 6.103896103896104e-05, |
|
"loss": 0.0034, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.5303643724696356, |
|
"grad_norm": 0.01396004669368267, |
|
"learning_rate": 6.136363636363636e-05, |
|
"loss": 0.0028, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.01871366612613201, |
|
"learning_rate": 6.16883116883117e-05, |
|
"loss": 0.0054, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.5465587044534415, |
|
"grad_norm": 0.01240773219615221, |
|
"learning_rate": 6.201298701298701e-05, |
|
"loss": 0.0021, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.5546558704453441, |
|
"grad_norm": 0.019545145332813263, |
|
"learning_rate": 6.233766233766233e-05, |
|
"loss": 0.0044, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.5627530364372468, |
|
"grad_norm": 0.011620803736150265, |
|
"learning_rate": 6.266233766233767e-05, |
|
"loss": 0.0034, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.5708502024291497, |
|
"grad_norm": 0.018584923818707466, |
|
"learning_rate": 6.2987012987013e-05, |
|
"loss": 0.0065, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 0.01458702515810728, |
|
"learning_rate": 6.331168831168832e-05, |
|
"loss": 0.0033, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.5870445344129553, |
|
"grad_norm": 0.01610160432755947, |
|
"learning_rate": 6.363636363636364e-05, |
|
"loss": 0.0045, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.5951417004048583, |
|
"grad_norm": 0.017108574509620667, |
|
"learning_rate": 6.396103896103896e-05, |
|
"loss": 0.0032, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.6032388663967612, |
|
"grad_norm": 0.013298330828547478, |
|
"learning_rate": 6.428571428571429e-05, |
|
"loss": 0.0026, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.6113360323886639, |
|
"grad_norm": 0.013509229756891727, |
|
"learning_rate": 6.461038961038961e-05, |
|
"loss": 0.0018, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.6194331983805668, |
|
"grad_norm": 0.01902744546532631, |
|
"learning_rate": 6.493506493506494e-05, |
|
"loss": 0.0029, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6194331983805668, |
|
"eval_loss": 0.004173097666352987, |
|
"eval_runtime": 20.8941, |
|
"eval_samples_per_second": 4.786, |
|
"eval_steps_per_second": 1.197, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6275303643724697, |
|
"grad_norm": 0.015973802655935287, |
|
"learning_rate": 6.525974025974026e-05, |
|
"loss": 0.0028, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.6356275303643724, |
|
"grad_norm": 0.018992941826581955, |
|
"learning_rate": 6.55844155844156e-05, |
|
"loss": 0.0052, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.6437246963562753, |
|
"grad_norm": 0.014920739457011223, |
|
"learning_rate": 6.59090909090909e-05, |
|
"loss": 0.0039, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.6518218623481782, |
|
"grad_norm": 0.015221747569739819, |
|
"learning_rate": 6.623376623376624e-05, |
|
"loss": 0.0026, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.6599190283400809, |
|
"grad_norm": 0.01537750568240881, |
|
"learning_rate": 6.655844155844157e-05, |
|
"loss": 0.0062, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.6680161943319838, |
|
"grad_norm": 0.011275989934802055, |
|
"learning_rate": 6.688311688311688e-05, |
|
"loss": 0.0027, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.6761133603238867, |
|
"grad_norm": 0.017085865139961243, |
|
"learning_rate": 6.720779220779221e-05, |
|
"loss": 0.0029, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.6842105263157894, |
|
"grad_norm": 0.012188843451440334, |
|
"learning_rate": 6.753246753246754e-05, |
|
"loss": 0.0033, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 0.009229995310306549, |
|
"learning_rate": 6.785714285714286e-05, |
|
"loss": 0.0023, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.7004048582995952, |
|
"grad_norm": 0.013247930444777012, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 0.0025, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.708502024291498, |
|
"grad_norm": 0.017777971923351288, |
|
"learning_rate": 6.850649350649351e-05, |
|
"loss": 0.0059, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.7165991902834008, |
|
"grad_norm": 0.011387546546757221, |
|
"learning_rate": 6.883116883116883e-05, |
|
"loss": 0.0024, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.7246963562753037, |
|
"grad_norm": 0.013648373074829578, |
|
"learning_rate": 6.915584415584417e-05, |
|
"loss": 0.0027, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.7327935222672064, |
|
"grad_norm": 0.012230796739459038, |
|
"learning_rate": 6.948051948051948e-05, |
|
"loss": 0.0032, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.7408906882591093, |
|
"grad_norm": 0.00890358630567789, |
|
"learning_rate": 6.98051948051948e-05, |
|
"loss": 0.0017, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.7489878542510122, |
|
"grad_norm": 0.019259551540017128, |
|
"learning_rate": 7.012987012987014e-05, |
|
"loss": 0.0037, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.757085020242915, |
|
"grad_norm": 0.01166984811425209, |
|
"learning_rate": 7.045454545454546e-05, |
|
"loss": 0.0029, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.7651821862348178, |
|
"grad_norm": 0.014050965197384357, |
|
"learning_rate": 7.077922077922077e-05, |
|
"loss": 0.0019, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.7732793522267207, |
|
"grad_norm": 0.012960278429090977, |
|
"learning_rate": 7.110389610389611e-05, |
|
"loss": 0.0019, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.7813765182186234, |
|
"grad_norm": 0.01847727596759796, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.0042, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.7894736842105263, |
|
"grad_norm": 0.012762092985212803, |
|
"learning_rate": 7.175324675324676e-05, |
|
"loss": 0.0013, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.7975708502024292, |
|
"grad_norm": 0.014623441733419895, |
|
"learning_rate": 7.207792207792208e-05, |
|
"loss": 0.0039, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.805668016194332, |
|
"grad_norm": 0.017683332785964012, |
|
"learning_rate": 7.24025974025974e-05, |
|
"loss": 0.0043, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.8137651821862348, |
|
"grad_norm": 0.017056427896022797, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.0036, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.8218623481781377, |
|
"grad_norm": 0.010228103026747704, |
|
"learning_rate": 7.305194805194807e-05, |
|
"loss": 0.0017, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.8218623481781377, |
|
"eval_loss": 0.0039197178557515144, |
|
"eval_runtime": 20.8731, |
|
"eval_samples_per_second": 4.791, |
|
"eval_steps_per_second": 1.198, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.8299595141700404, |
|
"grad_norm": 0.016191432252526283, |
|
"learning_rate": 7.337662337662338e-05, |
|
"loss": 0.0057, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.8380566801619433, |
|
"grad_norm": 0.010266617871820927, |
|
"learning_rate": 7.37012987012987e-05, |
|
"loss": 0.0021, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 0.016297219321131706, |
|
"learning_rate": 7.402597402597404e-05, |
|
"loss": 0.0038, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.854251012145749, |
|
"grad_norm": 0.011634326539933681, |
|
"learning_rate": 7.435064935064936e-05, |
|
"loss": 0.0033, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.8623481781376519, |
|
"grad_norm": 0.01735992170870304, |
|
"learning_rate": 7.467532467532467e-05, |
|
"loss": 0.0045, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.8704453441295548, |
|
"grad_norm": 0.012468023225665092, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0039, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.8785425101214575, |
|
"grad_norm": 0.01030401885509491, |
|
"learning_rate": 7.532467532467533e-05, |
|
"loss": 0.0033, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.8866396761133604, |
|
"grad_norm": 0.008860866539180279, |
|
"learning_rate": 7.564935064935065e-05, |
|
"loss": 0.0025, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.8947368421052633, |
|
"grad_norm": 0.014425918459892273, |
|
"learning_rate": 7.597402597402598e-05, |
|
"loss": 0.0054, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.902834008097166, |
|
"grad_norm": 0.012539315037429333, |
|
"learning_rate": 7.62987012987013e-05, |
|
"loss": 0.0044, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.9109311740890689, |
|
"grad_norm": 0.0120421526953578, |
|
"learning_rate": 7.662337662337662e-05, |
|
"loss": 0.0048, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.9190283400809718, |
|
"grad_norm": 0.011059713549911976, |
|
"learning_rate": 7.694805194805195e-05, |
|
"loss": 0.0024, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.9271255060728745, |
|
"grad_norm": 0.01062751654535532, |
|
"learning_rate": 7.727272727272727e-05, |
|
"loss": 0.0025, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.9352226720647774, |
|
"grad_norm": 0.009996469132602215, |
|
"learning_rate": 7.75974025974026e-05, |
|
"loss": 0.0022, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.9433198380566803, |
|
"grad_norm": 0.014030283316969872, |
|
"learning_rate": 7.792207792207793e-05, |
|
"loss": 0.0027, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.951417004048583, |
|
"grad_norm": 0.011797044426202774, |
|
"learning_rate": 7.824675324675324e-05, |
|
"loss": 0.0039, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.9595141700404857, |
|
"grad_norm": 0.014973408542573452, |
|
"learning_rate": 7.857142857142858e-05, |
|
"loss": 0.0039, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.9676113360323888, |
|
"grad_norm": 0.01119126658886671, |
|
"learning_rate": 7.88961038961039e-05, |
|
"loss": 0.0021, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.9757085020242915, |
|
"grad_norm": 0.012466533109545708, |
|
"learning_rate": 7.922077922077923e-05, |
|
"loss": 0.0024, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.9838056680161942, |
|
"grad_norm": 0.01311230007559061, |
|
"learning_rate": 7.954545454545455e-05, |
|
"loss": 0.0037, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.9919028340080973, |
|
"grad_norm": 0.01020133588463068, |
|
"learning_rate": 7.987012987012987e-05, |
|
"loss": 0.0027, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.01899588108062744, |
|
"learning_rate": 8.01948051948052e-05, |
|
"loss": 0.0032, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.0080971659919027, |
|
"grad_norm": 0.011334598064422607, |
|
"learning_rate": 8.051948051948052e-05, |
|
"loss": 0.002, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.016194331983806, |
|
"grad_norm": 0.011807809583842754, |
|
"learning_rate": 8.084415584415585e-05, |
|
"loss": 0.002, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.0242914979757085, |
|
"grad_norm": 0.010670343413949013, |
|
"learning_rate": 8.116883116883117e-05, |
|
"loss": 0.0022, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0242914979757085, |
|
"eval_loss": 0.0035137999802827835, |
|
"eval_runtime": 20.89, |
|
"eval_samples_per_second": 4.787, |
|
"eval_steps_per_second": 1.197, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.032388663967611, |
|
"grad_norm": 0.011268955655395985, |
|
"learning_rate": 8.14935064935065e-05, |
|
"loss": 0.0016, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.0404858299595143, |
|
"grad_norm": 0.013640797697007656, |
|
"learning_rate": 8.181818181818183e-05, |
|
"loss": 0.0029, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.048582995951417, |
|
"grad_norm": 0.008933900855481625, |
|
"learning_rate": 8.214285714285714e-05, |
|
"loss": 0.0021, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.0566801619433197, |
|
"grad_norm": 0.012379194609820843, |
|
"learning_rate": 8.246753246753248e-05, |
|
"loss": 0.0026, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.064777327935223, |
|
"grad_norm": 0.015894446521997452, |
|
"learning_rate": 8.27922077922078e-05, |
|
"loss": 0.002, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.0728744939271255, |
|
"grad_norm": 0.013013158924877644, |
|
"learning_rate": 8.311688311688312e-05, |
|
"loss": 0.002, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.080971659919028, |
|
"grad_norm": 0.00733231520280242, |
|
"learning_rate": 8.344155844155845e-05, |
|
"loss": 0.0012, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.0890688259109313, |
|
"grad_norm": 0.011731351725757122, |
|
"learning_rate": 8.376623376623377e-05, |
|
"loss": 0.0028, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.097165991902834, |
|
"grad_norm": 0.010311335325241089, |
|
"learning_rate": 8.40909090909091e-05, |
|
"loss": 0.0017, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 0.010359687730669975, |
|
"learning_rate": 8.441558441558442e-05, |
|
"loss": 0.0033, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.11336032388664, |
|
"grad_norm": 0.017441799864172935, |
|
"learning_rate": 8.474025974025974e-05, |
|
"loss": 0.0032, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.1214574898785425, |
|
"grad_norm": 0.014498945325613022, |
|
"learning_rate": 8.506493506493507e-05, |
|
"loss": 0.0048, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.1295546558704452, |
|
"grad_norm": 0.011659245006740093, |
|
"learning_rate": 8.53896103896104e-05, |
|
"loss": 0.0021, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.1376518218623484, |
|
"grad_norm": 0.01307612657546997, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.0024, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.145748987854251, |
|
"grad_norm": 0.007933567278087139, |
|
"learning_rate": 8.603896103896104e-05, |
|
"loss": 0.0017, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.010644306428730488, |
|
"learning_rate": 8.636363636363637e-05, |
|
"loss": 0.0022, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.161943319838057, |
|
"grad_norm": 0.017315855249762535, |
|
"learning_rate": 8.66883116883117e-05, |
|
"loss": 0.0013, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.1700404858299596, |
|
"grad_norm": 0.013733215630054474, |
|
"learning_rate": 8.701298701298701e-05, |
|
"loss": 0.0019, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.1781376518218623, |
|
"grad_norm": 0.019037563353776932, |
|
"learning_rate": 8.733766233766234e-05, |
|
"loss": 0.0044, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.1862348178137654, |
|
"grad_norm": 0.014429651200771332, |
|
"learning_rate": 8.766233766233767e-05, |
|
"loss": 0.003, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.194331983805668, |
|
"grad_norm": 0.013059676624834538, |
|
"learning_rate": 8.798701298701299e-05, |
|
"loss": 0.0022, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.2024291497975708, |
|
"grad_norm": 0.011815879493951797, |
|
"learning_rate": 8.831168831168831e-05, |
|
"loss": 0.0016, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.2105263157894735, |
|
"grad_norm": 0.010117270052433014, |
|
"learning_rate": 8.863636363636364e-05, |
|
"loss": 0.0018, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.2186234817813766, |
|
"grad_norm": 0.012147231958806515, |
|
"learning_rate": 8.896103896103896e-05, |
|
"loss": 0.0028, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.2267206477732793, |
|
"grad_norm": 0.013275344856083393, |
|
"learning_rate": 8.92857142857143e-05, |
|
"loss": 0.0027, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.2267206477732793, |
|
"eval_loss": 0.003232660237699747, |
|
"eval_runtime": 20.8846, |
|
"eval_samples_per_second": 4.788, |
|
"eval_steps_per_second": 1.197, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.234817813765182, |
|
"grad_norm": 0.009600469842553139, |
|
"learning_rate": 8.961038961038961e-05, |
|
"loss": 0.0018, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.242914979757085, |
|
"grad_norm": 0.013018307276070118, |
|
"learning_rate": 8.993506493506493e-05, |
|
"loss": 0.0013, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.251012145748988, |
|
"grad_norm": 0.013700856827199459, |
|
"learning_rate": 9.025974025974027e-05, |
|
"loss": 0.0026, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.2591093117408905, |
|
"grad_norm": 0.012119555845856667, |
|
"learning_rate": 9.05844155844156e-05, |
|
"loss": 0.0014, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.2672064777327936, |
|
"grad_norm": 0.01446222048252821, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.0016, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.2753036437246963, |
|
"grad_norm": 0.008024114184081554, |
|
"learning_rate": 9.123376623376624e-05, |
|
"loss": 0.0014, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.283400809716599, |
|
"grad_norm": 0.015081741847097874, |
|
"learning_rate": 9.155844155844156e-05, |
|
"loss": 0.0027, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.291497975708502, |
|
"grad_norm": 0.012656310573220253, |
|
"learning_rate": 9.188311688311689e-05, |
|
"loss": 0.0019, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.299595141700405, |
|
"grad_norm": 0.016468411311507225, |
|
"learning_rate": 9.220779220779221e-05, |
|
"loss": 0.0025, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 0.0127490209415555, |
|
"learning_rate": 9.253246753246754e-05, |
|
"loss": 0.0019, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.3157894736842106, |
|
"grad_norm": 0.01162753626704216, |
|
"learning_rate": 9.285714285714286e-05, |
|
"loss": 0.0017, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.3238866396761133, |
|
"grad_norm": 0.01099877618253231, |
|
"learning_rate": 9.318181818181818e-05, |
|
"loss": 0.0016, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.331983805668016, |
|
"grad_norm": 0.009699794463813305, |
|
"learning_rate": 9.35064935064935e-05, |
|
"loss": 0.0018, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.340080971659919, |
|
"grad_norm": 0.011390355415642262, |
|
"learning_rate": 9.383116883116884e-05, |
|
"loss": 0.0024, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.348178137651822, |
|
"grad_norm": 0.01110926829278469, |
|
"learning_rate": 9.415584415584417e-05, |
|
"loss": 0.0016, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.3562753036437245, |
|
"grad_norm": 0.012503352016210556, |
|
"learning_rate": 9.448051948051948e-05, |
|
"loss": 0.0018, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.3643724696356276, |
|
"grad_norm": 0.01324179582297802, |
|
"learning_rate": 9.480519480519481e-05, |
|
"loss": 0.0025, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.3724696356275303, |
|
"grad_norm": 0.010324635542929173, |
|
"learning_rate": 9.512987012987014e-05, |
|
"loss": 0.0018, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.380566801619433, |
|
"grad_norm": 0.010333823971450329, |
|
"learning_rate": 9.545454545454546e-05, |
|
"loss": 0.0012, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.388663967611336, |
|
"grad_norm": 0.011566666886210442, |
|
"learning_rate": 9.577922077922078e-05, |
|
"loss": 0.0023, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.396761133603239, |
|
"grad_norm": 0.008786414749920368, |
|
"learning_rate": 9.610389610389611e-05, |
|
"loss": 0.0016, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.4048582995951415, |
|
"grad_norm": 0.011586328037083149, |
|
"learning_rate": 9.642857142857143e-05, |
|
"loss": 0.0023, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.4129554655870447, |
|
"grad_norm": 0.014018291607499123, |
|
"learning_rate": 9.675324675324677e-05, |
|
"loss": 0.0019, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.4210526315789473, |
|
"grad_norm": 0.008588538505136967, |
|
"learning_rate": 9.707792207792208e-05, |
|
"loss": 0.0009, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.42914979757085, |
|
"grad_norm": 0.009654571302235126, |
|
"learning_rate": 9.74025974025974e-05, |
|
"loss": 0.0017, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.42914979757085, |
|
"eval_loss": 0.003001517616212368, |
|
"eval_runtime": 20.9488, |
|
"eval_samples_per_second": 4.774, |
|
"eval_steps_per_second": 1.193, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.437246963562753, |
|
"grad_norm": 0.01593548245728016, |
|
"learning_rate": 9.772727272727274e-05, |
|
"loss": 0.0027, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.445344129554656, |
|
"grad_norm": 0.01584690809249878, |
|
"learning_rate": 9.805194805194806e-05, |
|
"loss": 0.0031, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.4534412955465585, |
|
"grad_norm": 0.01336862612515688, |
|
"learning_rate": 9.837662337662337e-05, |
|
"loss": 0.0032, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.009371182881295681, |
|
"learning_rate": 9.870129870129871e-05, |
|
"loss": 0.0015, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.4696356275303644, |
|
"grad_norm": 0.012227087281644344, |
|
"learning_rate": 9.902597402597403e-05, |
|
"loss": 0.0015, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.477732793522267, |
|
"grad_norm": 0.009863483719527721, |
|
"learning_rate": 9.935064935064936e-05, |
|
"loss": 0.0021, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.48582995951417, |
|
"grad_norm": 0.013306519947946072, |
|
"learning_rate": 9.967532467532468e-05, |
|
"loss": 0.0032, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.493927125506073, |
|
"grad_norm": 0.009393845684826374, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0016, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.5020242914979756, |
|
"grad_norm": 0.009558003395795822, |
|
"learning_rate": 9.999996777288795e-05, |
|
"loss": 0.0021, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.5101214574898787, |
|
"grad_norm": 0.01038302294909954, |
|
"learning_rate": 9.999987109159334e-05, |
|
"loss": 0.003, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.5182186234817814, |
|
"grad_norm": 0.011483744718134403, |
|
"learning_rate": 9.999970995624077e-05, |
|
"loss": 0.0026, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.526315789473684, |
|
"grad_norm": 0.012138905003666878, |
|
"learning_rate": 9.9999484367038e-05, |
|
"loss": 0.0018, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.534412955465587, |
|
"grad_norm": 0.008925210684537888, |
|
"learning_rate": 9.999919432427583e-05, |
|
"loss": 0.0012, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.54251012145749, |
|
"grad_norm": 0.0103675602003932, |
|
"learning_rate": 9.999883982832811e-05, |
|
"loss": 0.0015, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.5506072874493926, |
|
"grad_norm": 0.010114219971001148, |
|
"learning_rate": 9.999842087965185e-05, |
|
"loss": 0.0027, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.5587044534412957, |
|
"grad_norm": 0.011849566362798214, |
|
"learning_rate": 9.999793747878712e-05, |
|
"loss": 0.0035, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.5668016194331984, |
|
"grad_norm": 0.022183779627084732, |
|
"learning_rate": 9.999738962635703e-05, |
|
"loss": 0.0022, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.574898785425101, |
|
"grad_norm": 0.01708986796438694, |
|
"learning_rate": 9.999677732306782e-05, |
|
"loss": 0.0021, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.582995951417004, |
|
"grad_norm": 0.012563329190015793, |
|
"learning_rate": 9.999610056970881e-05, |
|
"loss": 0.0015, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.591093117408907, |
|
"grad_norm": 0.018579822033643723, |
|
"learning_rate": 9.999535936715239e-05, |
|
"loss": 0.0035, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.5991902834008096, |
|
"grad_norm": 0.014235563576221466, |
|
"learning_rate": 9.999455371635402e-05, |
|
"loss": 0.0022, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.6072874493927127, |
|
"grad_norm": 0.013800045475363731, |
|
"learning_rate": 9.999368361835226e-05, |
|
"loss": 0.0034, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 0.010226680897176266, |
|
"learning_rate": 9.999274907426876e-05, |
|
"loss": 0.0015, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.623481781376518, |
|
"grad_norm": 0.0106669832020998, |
|
"learning_rate": 9.99917500853082e-05, |
|
"loss": 0.0025, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 0.007497102487832308, |
|
"learning_rate": 9.999068665275834e-05, |
|
"loss": 0.0012, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"eval_loss": 0.002751573920249939, |
|
"eval_runtime": 20.8798, |
|
"eval_samples_per_second": 4.789, |
|
"eval_steps_per_second": 1.197, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.639676113360324, |
|
"grad_norm": 0.015462521463632584, |
|
"learning_rate": 9.99895587779901e-05, |
|
"loss": 0.0026, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.6477732793522266, |
|
"grad_norm": 0.008158071897923946, |
|
"learning_rate": 9.998836646245735e-05, |
|
"loss": 0.0011, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.6558704453441297, |
|
"grad_norm": 0.010231217369437218, |
|
"learning_rate": 9.998710970769711e-05, |
|
"loss": 0.0025, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.6639676113360324, |
|
"grad_norm": 0.011228160932660103, |
|
"learning_rate": 9.998578851532945e-05, |
|
"loss": 0.0022, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.672064777327935, |
|
"grad_norm": 0.010875989682972431, |
|
"learning_rate": 9.998440288705747e-05, |
|
"loss": 0.0028, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.6801619433198383, |
|
"grad_norm": 0.009110379964113235, |
|
"learning_rate": 9.998295282466738e-05, |
|
"loss": 0.0015, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.688259109311741, |
|
"grad_norm": 0.009458299726247787, |
|
"learning_rate": 9.998143833002845e-05, |
|
"loss": 0.0016, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.6963562753036436, |
|
"grad_norm": 0.00835677981376648, |
|
"learning_rate": 9.997985940509295e-05, |
|
"loss": 0.0013, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.7044534412955468, |
|
"grad_norm": 0.009965039789676666, |
|
"learning_rate": 9.997821605189627e-05, |
|
"loss": 0.0022, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.7125506072874495, |
|
"grad_norm": 0.010593763552606106, |
|
"learning_rate": 9.997650827255685e-05, |
|
"loss": 0.0015, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.720647773279352, |
|
"grad_norm": 0.010097038000822067, |
|
"learning_rate": 9.997473606927612e-05, |
|
"loss": 0.0015, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.7287449392712553, |
|
"grad_norm": 0.010393706150352955, |
|
"learning_rate": 9.997289944433864e-05, |
|
"loss": 0.0022, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.736842105263158, |
|
"grad_norm": 0.01462769228965044, |
|
"learning_rate": 9.997099840011195e-05, |
|
"loss": 0.0039, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.7449392712550607, |
|
"grad_norm": 0.010113600641489029, |
|
"learning_rate": 9.996903293904666e-05, |
|
"loss": 0.0018, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.753036437246964, |
|
"grad_norm": 0.011077907867729664, |
|
"learning_rate": 9.996700306367643e-05, |
|
"loss": 0.0009, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.7611336032388665, |
|
"grad_norm": 0.00902781542390585, |
|
"learning_rate": 9.996490877661793e-05, |
|
"loss": 0.0016, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 0.014123060740530491, |
|
"learning_rate": 9.996275008057087e-05, |
|
"loss": 0.0027, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.7773279352226723, |
|
"grad_norm": 0.014702217653393745, |
|
"learning_rate": 9.9960526978318e-05, |
|
"loss": 0.0026, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.785425101214575, |
|
"grad_norm": 0.007217355538159609, |
|
"learning_rate": 9.995823947272506e-05, |
|
"loss": 0.0009, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.7935222672064777, |
|
"grad_norm": 0.013469511643052101, |
|
"learning_rate": 9.995588756674088e-05, |
|
"loss": 0.0027, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.801619433198381, |
|
"grad_norm": 0.012271186336874962, |
|
"learning_rate": 9.995347126339725e-05, |
|
"loss": 0.0013, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.8097165991902835, |
|
"grad_norm": 0.012494235299527645, |
|
"learning_rate": 9.995099056580896e-05, |
|
"loss": 0.0018, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.817813765182186, |
|
"grad_norm": 0.008622893132269382, |
|
"learning_rate": 9.994844547717388e-05, |
|
"loss": 0.0017, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.8259109311740893, |
|
"grad_norm": 0.011038469150662422, |
|
"learning_rate": 9.994583600077283e-05, |
|
"loss": 0.0017, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.834008097165992, |
|
"grad_norm": 0.010193824768066406, |
|
"learning_rate": 9.994316213996964e-05, |
|
"loss": 0.002, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.834008097165992, |
|
"eval_loss": 0.0024985964410007, |
|
"eval_runtime": 20.8778, |
|
"eval_samples_per_second": 4.79, |
|
"eval_steps_per_second": 1.197, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.8421052631578947, |
|
"grad_norm": 0.008073719218373299, |
|
"learning_rate": 9.994042389821114e-05, |
|
"loss": 0.0013, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.850202429149798, |
|
"grad_norm": 0.007987983524799347, |
|
"learning_rate": 9.993762127902717e-05, |
|
"loss": 0.0012, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.8582995951417005, |
|
"grad_norm": 0.008735005743801594, |
|
"learning_rate": 9.993475428603052e-05, |
|
"loss": 0.0013, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.866396761133603, |
|
"grad_norm": 0.009095696732401848, |
|
"learning_rate": 9.9931822922917e-05, |
|
"loss": 0.0016, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.8744939271255063, |
|
"grad_norm": 0.011008110828697681, |
|
"learning_rate": 9.992882719346539e-05, |
|
"loss": 0.0021, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.882591093117409, |
|
"grad_norm": 0.009820708073675632, |
|
"learning_rate": 9.992576710153743e-05, |
|
"loss": 0.0024, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.8906882591093117, |
|
"grad_norm": 0.013662457466125488, |
|
"learning_rate": 9.992264265107784e-05, |
|
"loss": 0.0013, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.898785425101215, |
|
"grad_norm": 0.009639346040785313, |
|
"learning_rate": 9.991945384611431e-05, |
|
"loss": 0.0018, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.9068825910931175, |
|
"grad_norm": 0.008260666392743587, |
|
"learning_rate": 9.991620069075745e-05, |
|
"loss": 0.0011, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.91497975708502, |
|
"grad_norm": 0.015122090466320515, |
|
"learning_rate": 9.991288318920089e-05, |
|
"loss": 0.0012, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 0.011863719671964645, |
|
"learning_rate": 9.990950134572113e-05, |
|
"loss": 0.0015, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.931174089068826, |
|
"grad_norm": 0.013348613865673542, |
|
"learning_rate": 9.990605516467769e-05, |
|
"loss": 0.0022, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.9392712550607287, |
|
"grad_norm": 0.01165574137121439, |
|
"learning_rate": 9.990254465051297e-05, |
|
"loss": 0.0023, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.9473684210526314, |
|
"grad_norm": 0.010028968565165997, |
|
"learning_rate": 9.98989698077523e-05, |
|
"loss": 0.0022, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.9554655870445345, |
|
"grad_norm": 0.008236058987677097, |
|
"learning_rate": 9.9895330641004e-05, |
|
"loss": 0.0012, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.9635627530364372, |
|
"grad_norm": 0.012825064361095428, |
|
"learning_rate": 9.989162715495923e-05, |
|
"loss": 0.0021, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.97165991902834, |
|
"grad_norm": 0.014107435010373592, |
|
"learning_rate": 9.98878593543921e-05, |
|
"loss": 0.0036, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.979757085020243, |
|
"grad_norm": 0.014563079923391342, |
|
"learning_rate": 9.988402724415964e-05, |
|
"loss": 0.0025, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.9878542510121457, |
|
"grad_norm": 0.011997046880424023, |
|
"learning_rate": 9.988013082920173e-05, |
|
"loss": 0.0035, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.9959514170040484, |
|
"grad_norm": 0.006900448817759752, |
|
"learning_rate": 9.987617011454122e-05, |
|
"loss": 0.0013, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.0040485829959516, |
|
"grad_norm": 0.015069114975631237, |
|
"learning_rate": 9.987214510528378e-05, |
|
"loss": 0.0025, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.0121457489878543, |
|
"grad_norm": 0.007571605499833822, |
|
"learning_rate": 9.9868055806618e-05, |
|
"loss": 0.0015, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.020242914979757, |
|
"grad_norm": 0.008344545029103756, |
|
"learning_rate": 9.98639022238153e-05, |
|
"loss": 0.0017, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.02834008097166, |
|
"grad_norm": 0.011865397915244102, |
|
"learning_rate": 9.985968436223005e-05, |
|
"loss": 0.0021, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.0364372469635628, |
|
"grad_norm": 0.006094765849411488, |
|
"learning_rate": 9.985540222729939e-05, |
|
"loss": 0.0008, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.0364372469635628, |
|
"eval_loss": 0.00258046155795455, |
|
"eval_runtime": 20.9003, |
|
"eval_samples_per_second": 4.785, |
|
"eval_steps_per_second": 1.196, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.0445344129554655, |
|
"grad_norm": 0.009989949874579906, |
|
"learning_rate": 9.985105582454336e-05, |
|
"loss": 0.0013, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.0526315789473686, |
|
"grad_norm": 0.010446115396916866, |
|
"learning_rate": 9.984664515956486e-05, |
|
"loss": 0.0014, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.0607287449392713, |
|
"grad_norm": 0.018476417288184166, |
|
"learning_rate": 9.984217023804958e-05, |
|
"loss": 0.0021, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 3.068825910931174, |
|
"grad_norm": 0.01866602897644043, |
|
"learning_rate": 9.983763106576612e-05, |
|
"loss": 0.0032, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 0.014463922940194607, |
|
"learning_rate": 9.983302764856579e-05, |
|
"loss": 0.0011, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.08502024291498, |
|
"grad_norm": 0.010341525077819824, |
|
"learning_rate": 9.982835999238285e-05, |
|
"loss": 0.0012, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 3.0931174089068825, |
|
"grad_norm": 0.00995098240673542, |
|
"learning_rate": 9.982362810323424e-05, |
|
"loss": 0.0012, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 3.1012145748987856, |
|
"grad_norm": 0.009842706844210625, |
|
"learning_rate": 9.981883198721981e-05, |
|
"loss": 0.0008, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 3.1093117408906883, |
|
"grad_norm": 0.013185705058276653, |
|
"learning_rate": 9.981397165052215e-05, |
|
"loss": 0.0023, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 3.117408906882591, |
|
"grad_norm": 0.009965223260223866, |
|
"learning_rate": 9.980904709940666e-05, |
|
"loss": 0.0012, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.125506072874494, |
|
"grad_norm": 0.011264238506555557, |
|
"learning_rate": 9.980405834022146e-05, |
|
"loss": 0.0018, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 3.133603238866397, |
|
"grad_norm": 0.006440012715756893, |
|
"learning_rate": 9.97990053793975e-05, |
|
"loss": 0.0007, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.1417004048582995, |
|
"grad_norm": 0.016382023692131042, |
|
"learning_rate": 9.979388822344848e-05, |
|
"loss": 0.0014, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.1497975708502026, |
|
"grad_norm": 0.008492662571370602, |
|
"learning_rate": 9.978870687897086e-05, |
|
"loss": 0.0012, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"grad_norm": 0.008105689659714699, |
|
"learning_rate": 9.978346135264381e-05, |
|
"loss": 0.0016, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.165991902834008, |
|
"grad_norm": 0.007508544716984034, |
|
"learning_rate": 9.977815165122926e-05, |
|
"loss": 0.001, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.174089068825911, |
|
"grad_norm": 0.00950626190751791, |
|
"learning_rate": 9.977277778157186e-05, |
|
"loss": 0.002, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.182186234817814, |
|
"grad_norm": 0.009004231542348862, |
|
"learning_rate": 9.976733975059899e-05, |
|
"loss": 0.0015, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.1902834008097165, |
|
"grad_norm": 0.0037491165567189455, |
|
"learning_rate": 9.976183756532072e-05, |
|
"loss": 0.0004, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.1983805668016196, |
|
"grad_norm": 0.01322921272367239, |
|
"learning_rate": 9.975627123282985e-05, |
|
"loss": 0.0017, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.2064777327935223, |
|
"grad_norm": 0.006518376059830189, |
|
"learning_rate": 9.975064076030184e-05, |
|
"loss": 0.0008, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.214574898785425, |
|
"grad_norm": 0.011334234848618507, |
|
"learning_rate": 9.974494615499487e-05, |
|
"loss": 0.0017, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.2226720647773277, |
|
"grad_norm": 0.007272353395819664, |
|
"learning_rate": 9.973918742424972e-05, |
|
"loss": 0.001, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"grad_norm": 0.009874061681330204, |
|
"learning_rate": 9.973336457548992e-05, |
|
"loss": 0.0015, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.2388663967611335, |
|
"grad_norm": 0.007885873317718506, |
|
"learning_rate": 9.972747761622159e-05, |
|
"loss": 0.0012, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.2388663967611335, |
|
"eval_loss": 0.0024508482310920954, |
|
"eval_runtime": 20.9363, |
|
"eval_samples_per_second": 4.776, |
|
"eval_steps_per_second": 1.194, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.246963562753036, |
|
"grad_norm": 0.007821009494364262, |
|
"learning_rate": 9.972152655403353e-05, |
|
"loss": 0.001, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.2550607287449393, |
|
"grad_norm": 0.014811043627560139, |
|
"learning_rate": 9.971551139659716e-05, |
|
"loss": 0.0024, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.263157894736842, |
|
"grad_norm": 0.009398553520441055, |
|
"learning_rate": 9.970943215166652e-05, |
|
"loss": 0.0014, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.2712550607287447, |
|
"grad_norm": 0.007041712291538715, |
|
"learning_rate": 9.970328882707829e-05, |
|
"loss": 0.0012, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.279352226720648, |
|
"grad_norm": 0.0057219392620027065, |
|
"learning_rate": 9.969708143075171e-05, |
|
"loss": 0.0005, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.2874493927125505, |
|
"grad_norm": 0.01213089469820261, |
|
"learning_rate": 9.969080997068865e-05, |
|
"loss": 0.0018, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.2955465587044532, |
|
"grad_norm": 0.007616452407091856, |
|
"learning_rate": 9.968447445497356e-05, |
|
"loss": 0.0009, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.3036437246963564, |
|
"grad_norm": 0.006538788788020611, |
|
"learning_rate": 9.967807489177344e-05, |
|
"loss": 0.0011, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.311740890688259, |
|
"grad_norm": 0.01062245387583971, |
|
"learning_rate": 9.967161128933788e-05, |
|
"loss": 0.0014, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.3198380566801617, |
|
"grad_norm": 0.011872652918100357, |
|
"learning_rate": 9.966508365599899e-05, |
|
"loss": 0.0016, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.327935222672065, |
|
"grad_norm": 0.012371920980513096, |
|
"learning_rate": 9.965849200017145e-05, |
|
"loss": 0.0021, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.3360323886639676, |
|
"grad_norm": 0.009583608247339725, |
|
"learning_rate": 9.965183633035249e-05, |
|
"loss": 0.0014, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.3441295546558703, |
|
"grad_norm": 0.009689375758171082, |
|
"learning_rate": 9.964511665512179e-05, |
|
"loss": 0.0011, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.3522267206477734, |
|
"grad_norm": 0.007418768480420113, |
|
"learning_rate": 9.963833298314159e-05, |
|
"loss": 0.001, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.360323886639676, |
|
"grad_norm": 0.009091746993362904, |
|
"learning_rate": 9.963148532315663e-05, |
|
"loss": 0.0014, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.3684210526315788, |
|
"grad_norm": 0.007394388318061829, |
|
"learning_rate": 9.962457368399409e-05, |
|
"loss": 0.0012, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.376518218623482, |
|
"grad_norm": 0.011532511562108994, |
|
"learning_rate": 9.96175980745637e-05, |
|
"loss": 0.0008, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 0.009239987470209599, |
|
"learning_rate": 9.961055850385759e-05, |
|
"loss": 0.0015, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.3927125506072873, |
|
"grad_norm": 0.009725586511194706, |
|
"learning_rate": 9.960345498095036e-05, |
|
"loss": 0.0019, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.4008097165991904, |
|
"grad_norm": 0.008276725187897682, |
|
"learning_rate": 9.959628751499906e-05, |
|
"loss": 0.001, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.408906882591093, |
|
"grad_norm": 0.007257894612848759, |
|
"learning_rate": 9.958905611524313e-05, |
|
"loss": 0.0008, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.417004048582996, |
|
"grad_norm": 0.007111871615052223, |
|
"learning_rate": 9.95817607910045e-05, |
|
"loss": 0.0007, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.425101214574899, |
|
"grad_norm": 0.008013184182345867, |
|
"learning_rate": 9.957440155168743e-05, |
|
"loss": 0.0005, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.4331983805668016, |
|
"grad_norm": 0.007757282350212336, |
|
"learning_rate": 9.95669784067786e-05, |
|
"loss": 0.0012, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.4412955465587043, |
|
"grad_norm": 0.01001273188740015, |
|
"learning_rate": 9.955949136584709e-05, |
|
"loss": 0.0013, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.4412955465587043, |
|
"eval_loss": 0.0021295032929629087, |
|
"eval_runtime": 20.8824, |
|
"eval_samples_per_second": 4.789, |
|
"eval_steps_per_second": 1.197, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.4493927125506074, |
|
"grad_norm": 0.0063932668417692184, |
|
"learning_rate": 9.95519404385443e-05, |
|
"loss": 0.001, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.45748987854251, |
|
"grad_norm": 0.010445266962051392, |
|
"learning_rate": 9.954432563460403e-05, |
|
"loss": 0.0013, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.465587044534413, |
|
"grad_norm": 0.006435538176447153, |
|
"learning_rate": 9.953664696384242e-05, |
|
"loss": 0.0007, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.473684210526316, |
|
"grad_norm": 0.004963045008480549, |
|
"learning_rate": 9.95289044361579e-05, |
|
"loss": 0.0008, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.4817813765182186, |
|
"grad_norm": 0.02623671293258667, |
|
"learning_rate": 9.952109806153125e-05, |
|
"loss": 0.0009, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.4898785425101213, |
|
"grad_norm": 0.00943893101066351, |
|
"learning_rate": 9.951322785002554e-05, |
|
"loss": 0.0012, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.4979757085020244, |
|
"grad_norm": 0.0059456489980220795, |
|
"learning_rate": 9.950529381178617e-05, |
|
"loss": 0.0009, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.506072874493927, |
|
"grad_norm": 0.00898104626685381, |
|
"learning_rate": 9.949729595704076e-05, |
|
"loss": 0.0014, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.51417004048583, |
|
"grad_norm": 0.009014531970024109, |
|
"learning_rate": 9.948923429609921e-05, |
|
"loss": 0.0012, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.522267206477733, |
|
"grad_norm": 0.012250890955328941, |
|
"learning_rate": 9.948110883935371e-05, |
|
"loss": 0.0024, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.5303643724696356, |
|
"grad_norm": 0.009938407689332962, |
|
"learning_rate": 9.947291959727863e-05, |
|
"loss": 0.0012, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 0.009354399517178535, |
|
"learning_rate": 9.94646665804306e-05, |
|
"loss": 0.0015, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.5465587044534415, |
|
"grad_norm": 0.008449352346360683, |
|
"learning_rate": 9.94563497994485e-05, |
|
"loss": 0.0016, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.554655870445344, |
|
"grad_norm": 0.006286781746894121, |
|
"learning_rate": 9.944796926505331e-05, |
|
"loss": 0.0009, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.562753036437247, |
|
"grad_norm": 0.00723978690803051, |
|
"learning_rate": 9.943952498804827e-05, |
|
"loss": 0.0012, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.57085020242915, |
|
"grad_norm": 0.00893012247979641, |
|
"learning_rate": 9.943101697931875e-05, |
|
"loss": 0.0016, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.5789473684210527, |
|
"grad_norm": 0.00894072838127613, |
|
"learning_rate": 9.942244524983232e-05, |
|
"loss": 0.0017, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.5870445344129553, |
|
"grad_norm": 0.005625961814075708, |
|
"learning_rate": 9.941380981063864e-05, |
|
"loss": 0.0008, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.5951417004048585, |
|
"grad_norm": 0.010527077130973339, |
|
"learning_rate": 9.940511067286952e-05, |
|
"loss": 0.0012, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.603238866396761, |
|
"grad_norm": 0.00621502660214901, |
|
"learning_rate": 9.939634784773892e-05, |
|
"loss": 0.0009, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.611336032388664, |
|
"grad_norm": 0.012210030108690262, |
|
"learning_rate": 9.938752134654282e-05, |
|
"loss": 0.0014, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.619433198380567, |
|
"grad_norm": 0.01024533063173294, |
|
"learning_rate": 9.937863118065932e-05, |
|
"loss": 0.0012, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.6275303643724697, |
|
"grad_norm": 0.006288249045610428, |
|
"learning_rate": 9.936967736154864e-05, |
|
"loss": 0.0008, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.6356275303643724, |
|
"grad_norm": 0.010536898858845234, |
|
"learning_rate": 9.936065990075296e-05, |
|
"loss": 0.0008, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.6437246963562755, |
|
"grad_norm": 0.006477988325059414, |
|
"learning_rate": 9.935157880989658e-05, |
|
"loss": 0.0008, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.6437246963562755, |
|
"eval_loss": 0.0018996578874066472, |
|
"eval_runtime": 20.8987, |
|
"eval_samples_per_second": 4.785, |
|
"eval_steps_per_second": 1.196, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.651821862348178, |
|
"grad_norm": 0.009040674194693565, |
|
"learning_rate": 9.93424341006858e-05, |
|
"loss": 0.0018, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.659919028340081, |
|
"grad_norm": 0.010683619417250156, |
|
"learning_rate": 9.93332257849089e-05, |
|
"loss": 0.0012, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.668016194331984, |
|
"grad_norm": 0.010580274276435375, |
|
"learning_rate": 9.932395387443618e-05, |
|
"loss": 0.0012, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.6761133603238867, |
|
"grad_norm": 0.01016503106802702, |
|
"learning_rate": 9.931461838121993e-05, |
|
"loss": 0.0013, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"grad_norm": 0.006146210245788097, |
|
"learning_rate": 9.930521931729439e-05, |
|
"loss": 0.0006, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 0.0124620720744133, |
|
"learning_rate": 9.929575669477572e-05, |
|
"loss": 0.0015, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.700404858299595, |
|
"grad_norm": 0.011394270695745945, |
|
"learning_rate": 9.928623052586207e-05, |
|
"loss": 0.0012, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.708502024291498, |
|
"grad_norm": 0.005241707898676395, |
|
"learning_rate": 9.927664082283345e-05, |
|
"loss": 0.0005, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.716599190283401, |
|
"grad_norm": 0.00538614671677351, |
|
"learning_rate": 9.926698759805184e-05, |
|
"loss": 0.0006, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.7246963562753037, |
|
"grad_norm": 0.011280486360192299, |
|
"learning_rate": 9.925727086396101e-05, |
|
"loss": 0.0015, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.7327935222672064, |
|
"grad_norm": 0.006988388951867819, |
|
"learning_rate": 9.924749063308668e-05, |
|
"loss": 0.001, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.7408906882591095, |
|
"grad_norm": 0.011907723732292652, |
|
"learning_rate": 9.923764691803639e-05, |
|
"loss": 0.0021, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.748987854251012, |
|
"grad_norm": 0.012228334322571754, |
|
"learning_rate": 9.922773973149953e-05, |
|
"loss": 0.0011, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.757085020242915, |
|
"grad_norm": 0.00736306793987751, |
|
"learning_rate": 9.921776908624727e-05, |
|
"loss": 0.0013, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.765182186234818, |
|
"grad_norm": 0.007194120436906815, |
|
"learning_rate": 9.920773499513266e-05, |
|
"loss": 0.0009, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.7732793522267207, |
|
"grad_norm": 0.00893466267734766, |
|
"learning_rate": 9.919763747109043e-05, |
|
"loss": 0.0012, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.7813765182186234, |
|
"grad_norm": 0.009740286506712437, |
|
"learning_rate": 9.91874765271372e-05, |
|
"loss": 0.0019, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.7894736842105265, |
|
"grad_norm": 0.004781534895300865, |
|
"learning_rate": 9.917725217637126e-05, |
|
"loss": 0.0006, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.7975708502024292, |
|
"grad_norm": 0.011936492286622524, |
|
"learning_rate": 9.916696443197267e-05, |
|
"loss": 0.0015, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.805668016194332, |
|
"grad_norm": 0.008463852107524872, |
|
"learning_rate": 9.91566133072032e-05, |
|
"loss": 0.0012, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.813765182186235, |
|
"grad_norm": 0.008224911987781525, |
|
"learning_rate": 9.914619881540629e-05, |
|
"loss": 0.0011, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.8218623481781377, |
|
"grad_norm": 0.00846084114164114, |
|
"learning_rate": 9.913572097000716e-05, |
|
"loss": 0.0011, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.8299595141700404, |
|
"grad_norm": 0.006916975602507591, |
|
"learning_rate": 9.912517978451259e-05, |
|
"loss": 0.001, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.8380566801619436, |
|
"grad_norm": 0.015935301780700684, |
|
"learning_rate": 9.911457527251109e-05, |
|
"loss": 0.0011, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 0.009782219305634499, |
|
"learning_rate": 9.910390744767275e-05, |
|
"loss": 0.0014, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"eval_loss": 0.0019529929850250483, |
|
"eval_runtime": 20.908, |
|
"eval_samples_per_second": 4.783, |
|
"eval_steps_per_second": 1.196, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.854251012145749, |
|
"grad_norm": 0.008936782367527485, |
|
"learning_rate": 9.90931763237493e-05, |
|
"loss": 0.0015, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.862348178137652, |
|
"grad_norm": 0.006487220525741577, |
|
"learning_rate": 9.908238191457409e-05, |
|
"loss": 0.0011, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.8704453441295548, |
|
"grad_norm": 0.011362340301275253, |
|
"learning_rate": 9.907152423406199e-05, |
|
"loss": 0.0019, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.8785425101214575, |
|
"grad_norm": 0.011046521365642548, |
|
"learning_rate": 9.906060329620949e-05, |
|
"loss": 0.0017, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.8866396761133606, |
|
"grad_norm": 0.005723021924495697, |
|
"learning_rate": 9.904961911509459e-05, |
|
"loss": 0.0007, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.8947368421052633, |
|
"grad_norm": 0.004767613019794226, |
|
"learning_rate": 9.903857170487684e-05, |
|
"loss": 0.0006, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.902834008097166, |
|
"grad_norm": 0.0070763761177659035, |
|
"learning_rate": 9.902746107979728e-05, |
|
"loss": 0.0008, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.910931174089069, |
|
"grad_norm": 0.011897820979356766, |
|
"learning_rate": 9.901628725417843e-05, |
|
"loss": 0.0012, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.919028340080972, |
|
"grad_norm": 0.006268302444368601, |
|
"learning_rate": 9.900505024242431e-05, |
|
"loss": 0.0007, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.9271255060728745, |
|
"grad_norm": 0.007625557016581297, |
|
"learning_rate": 9.899375005902038e-05, |
|
"loss": 0.0019, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.9352226720647776, |
|
"grad_norm": 0.007365250959992409, |
|
"learning_rate": 9.898238671853352e-05, |
|
"loss": 0.001, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.9433198380566803, |
|
"grad_norm": 0.007656946778297424, |
|
"learning_rate": 9.897096023561205e-05, |
|
"loss": 0.0009, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.951417004048583, |
|
"grad_norm": 0.006735661532729864, |
|
"learning_rate": 9.895947062498566e-05, |
|
"loss": 0.0006, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.9595141700404857, |
|
"grad_norm": 0.009474151767790318, |
|
"learning_rate": 9.894791790146542e-05, |
|
"loss": 0.0015, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.967611336032389, |
|
"grad_norm": 0.00841295812278986, |
|
"learning_rate": 9.89363020799438e-05, |
|
"loss": 0.0011, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.9757085020242915, |
|
"grad_norm": 0.008091527037322521, |
|
"learning_rate": 9.892462317539455e-05, |
|
"loss": 0.0007, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.983805668016194, |
|
"grad_norm": 0.01014798879623413, |
|
"learning_rate": 9.891288120287276e-05, |
|
"loss": 0.0011, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.9919028340080973, |
|
"grad_norm": 0.008167481981217861, |
|
"learning_rate": 9.890107617751484e-05, |
|
"loss": 0.0008, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.006659403908997774, |
|
"learning_rate": 9.888920811453846e-05, |
|
"loss": 0.0006, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 4.008097165991903, |
|
"grad_norm": 0.0071459268219769, |
|
"learning_rate": 9.887727702924255e-05, |
|
"loss": 0.0008, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.016194331983805, |
|
"grad_norm": 0.007468671537935734, |
|
"learning_rate": 9.886528293700729e-05, |
|
"loss": 0.0007, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 4.0242914979757085, |
|
"grad_norm": 0.0065126921981573105, |
|
"learning_rate": 9.885322585329409e-05, |
|
"loss": 0.0005, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 4.032388663967612, |
|
"grad_norm": 0.007633598055690527, |
|
"learning_rate": 9.884110579364552e-05, |
|
"loss": 0.0007, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 4.040485829959514, |
|
"grad_norm": 0.00944494642317295, |
|
"learning_rate": 9.882892277368538e-05, |
|
"loss": 0.0006, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 4.048582995951417, |
|
"grad_norm": 0.011854424141347408, |
|
"learning_rate": 9.881667680911862e-05, |
|
"loss": 0.0009, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.048582995951417, |
|
"eval_loss": 0.001951522775925696, |
|
"eval_runtime": 20.8898, |
|
"eval_samples_per_second": 4.787, |
|
"eval_steps_per_second": 1.197, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.05668016194332, |
|
"grad_norm": 0.007982923649251461, |
|
"learning_rate": 9.880436791573133e-05, |
|
"loss": 0.001, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 4.064777327935222, |
|
"grad_norm": 0.007158339489251375, |
|
"learning_rate": 9.879199610939067e-05, |
|
"loss": 0.0005, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 4.0728744939271255, |
|
"grad_norm": 0.006214539520442486, |
|
"learning_rate": 9.877956140604498e-05, |
|
"loss": 0.0005, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 4.080971659919029, |
|
"grad_norm": 0.00849990639835596, |
|
"learning_rate": 9.876706382172365e-05, |
|
"loss": 0.0007, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 4.089068825910931, |
|
"grad_norm": 0.00827990472316742, |
|
"learning_rate": 9.87545033725371e-05, |
|
"loss": 0.001, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.097165991902834, |
|
"grad_norm": 0.010654132813215256, |
|
"learning_rate": 9.874188007467681e-05, |
|
"loss": 0.0013, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 4.105263157894737, |
|
"grad_norm": 0.00579674681648612, |
|
"learning_rate": 9.872919394441529e-05, |
|
"loss": 0.0005, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 4.113360323886639, |
|
"grad_norm": 0.006263192277401686, |
|
"learning_rate": 9.871644499810601e-05, |
|
"loss": 0.0006, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 4.1214574898785425, |
|
"grad_norm": 0.008381963707506657, |
|
"learning_rate": 9.870363325218349e-05, |
|
"loss": 0.0012, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 4.129554655870446, |
|
"grad_norm": 0.011459080502390862, |
|
"learning_rate": 9.86907587231631e-05, |
|
"loss": 0.0012, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.137651821862348, |
|
"grad_norm": 0.004725282080471516, |
|
"learning_rate": 9.867782142764122e-05, |
|
"loss": 0.0006, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 4.145748987854251, |
|
"grad_norm": 0.007089782506227493, |
|
"learning_rate": 9.866482138229511e-05, |
|
"loss": 0.0008, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"grad_norm": 0.005252163391560316, |
|
"learning_rate": 9.865175860388293e-05, |
|
"loss": 0.0006, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 4.161943319838056, |
|
"grad_norm": 0.006663177162408829, |
|
"learning_rate": 9.86386331092437e-05, |
|
"loss": 0.0007, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 4.17004048582996, |
|
"grad_norm": 0.007172934245318174, |
|
"learning_rate": 9.86254449152973e-05, |
|
"loss": 0.0006, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.178137651821863, |
|
"grad_norm": 0.006407279521226883, |
|
"learning_rate": 9.861219403904442e-05, |
|
"loss": 0.0005, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 4.186234817813765, |
|
"grad_norm": 0.007719367276877165, |
|
"learning_rate": 9.859888049756656e-05, |
|
"loss": 0.0008, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 4.194331983805668, |
|
"grad_norm": 0.008337226696312428, |
|
"learning_rate": 9.8585504308026e-05, |
|
"loss": 0.0007, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 4.202429149797571, |
|
"grad_norm": 0.008549238555133343, |
|
"learning_rate": 9.857206548766576e-05, |
|
"loss": 0.0005, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 4.2105263157894735, |
|
"grad_norm": 0.010102051310241222, |
|
"learning_rate": 9.855856405380966e-05, |
|
"loss": 0.0009, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.218623481781377, |
|
"grad_norm": 0.005718359258025885, |
|
"learning_rate": 9.854500002386215e-05, |
|
"loss": 0.0006, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 4.22672064777328, |
|
"grad_norm": 0.00954069197177887, |
|
"learning_rate": 9.853137341530842e-05, |
|
"loss": 0.0009, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 4.234817813765182, |
|
"grad_norm": 0.008051340468227863, |
|
"learning_rate": 9.851768424571433e-05, |
|
"loss": 0.0007, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 4.242914979757085, |
|
"grad_norm": 0.007244836073368788, |
|
"learning_rate": 9.850393253272637e-05, |
|
"loss": 0.0008, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 4.251012145748988, |
|
"grad_norm": 0.00602039834484458, |
|
"learning_rate": 9.849011829407166e-05, |
|
"loss": 0.0006, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.251012145748988, |
|
"eval_loss": 0.001924480078741908, |
|
"eval_runtime": 20.9917, |
|
"eval_samples_per_second": 4.764, |
|
"eval_steps_per_second": 1.191, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.2591093117408905, |
|
"grad_norm": 0.007480318192392588, |
|
"learning_rate": 9.84762415475579e-05, |
|
"loss": 0.001, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 4.267206477732794, |
|
"grad_norm": 0.008988871239125729, |
|
"learning_rate": 9.846230231107343e-05, |
|
"loss": 0.0008, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 4.275303643724697, |
|
"grad_norm": 0.00547422282397747, |
|
"learning_rate": 9.844830060258707e-05, |
|
"loss": 0.0007, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 4.283400809716599, |
|
"grad_norm": 0.004437068942934275, |
|
"learning_rate": 9.843423644014822e-05, |
|
"loss": 0.0006, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 4.291497975708502, |
|
"grad_norm": 0.0038599406834691763, |
|
"learning_rate": 9.842010984188676e-05, |
|
"loss": 0.0004, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.299595141700405, |
|
"grad_norm": 0.0072926743887364864, |
|
"learning_rate": 9.840592082601309e-05, |
|
"loss": 0.0007, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 0.006621995009481907, |
|
"learning_rate": 9.839166941081804e-05, |
|
"loss": 0.0011, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 4.315789473684211, |
|
"grad_norm": 0.008673852309584618, |
|
"learning_rate": 9.837735561467288e-05, |
|
"loss": 0.0012, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 4.323886639676114, |
|
"grad_norm": 0.004889126401394606, |
|
"learning_rate": 9.836297945602931e-05, |
|
"loss": 0.0005, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 4.331983805668016, |
|
"grad_norm": 0.010923854075372219, |
|
"learning_rate": 9.83485409534194e-05, |
|
"loss": 0.0013, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.340080971659919, |
|
"grad_norm": 0.013469451107084751, |
|
"learning_rate": 9.833404012545562e-05, |
|
"loss": 0.001, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 4.348178137651822, |
|
"grad_norm": 0.004695568699389696, |
|
"learning_rate": 9.831947699083076e-05, |
|
"loss": 0.0004, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 4.3562753036437245, |
|
"grad_norm": 0.005549240857362747, |
|
"learning_rate": 9.830485156831792e-05, |
|
"loss": 0.0004, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 4.364372469635628, |
|
"grad_norm": 0.004222598858177662, |
|
"learning_rate": 9.829016387677051e-05, |
|
"loss": 0.0004, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 4.372469635627531, |
|
"grad_norm": 0.0063895429484546185, |
|
"learning_rate": 9.827541393512221e-05, |
|
"loss": 0.0007, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.380566801619433, |
|
"grad_norm": 0.006316315848380327, |
|
"learning_rate": 9.826060176238693e-05, |
|
"loss": 0.0004, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 4.388663967611336, |
|
"grad_norm": 0.004954824224114418, |
|
"learning_rate": 9.824572737765883e-05, |
|
"loss": 0.0004, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 4.396761133603239, |
|
"grad_norm": 0.005867576692253351, |
|
"learning_rate": 9.823079080011222e-05, |
|
"loss": 0.0006, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 4.4048582995951415, |
|
"grad_norm": 0.004107177723199129, |
|
"learning_rate": 9.821579204900164e-05, |
|
"loss": 0.0003, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 4.412955465587045, |
|
"grad_norm": 0.008971895091235638, |
|
"learning_rate": 9.820073114366173e-05, |
|
"loss": 0.001, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 4.421052631578947, |
|
"grad_norm": 0.004289721138775349, |
|
"learning_rate": 9.818560810350727e-05, |
|
"loss": 0.0004, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 4.42914979757085, |
|
"grad_norm": 0.011725863441824913, |
|
"learning_rate": 9.817042294803314e-05, |
|
"loss": 0.0007, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 4.437246963562753, |
|
"grad_norm": 0.007441469002515078, |
|
"learning_rate": 9.81551756968143e-05, |
|
"loss": 0.0007, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 4.445344129554655, |
|
"grad_norm": 0.009847107343375683, |
|
"learning_rate": 9.813986636950572e-05, |
|
"loss": 0.0012, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 4.4534412955465585, |
|
"grad_norm": 0.008748643100261688, |
|
"learning_rate": 9.812449498584245e-05, |
|
"loss": 0.0005, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.4534412955465585, |
|
"eval_loss": 0.0018015182577073574, |
|
"eval_runtime": 20.9424, |
|
"eval_samples_per_second": 4.775, |
|
"eval_steps_per_second": 1.194, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"grad_norm": 0.007471165154129267, |
|
"learning_rate": 9.810906156563946e-05, |
|
"loss": 0.0006, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 4.469635627530364, |
|
"grad_norm": 0.008223461918532848, |
|
"learning_rate": 9.809356612879175e-05, |
|
"loss": 0.0008, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 4.477732793522267, |
|
"grad_norm": 0.006764471530914307, |
|
"learning_rate": 9.807800869527426e-05, |
|
"loss": 0.0005, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 4.48582995951417, |
|
"grad_norm": 0.01072748377919197, |
|
"learning_rate": 9.806238928514184e-05, |
|
"loss": 0.0012, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 4.493927125506072, |
|
"grad_norm": 0.011665924452245235, |
|
"learning_rate": 9.80467079185292e-05, |
|
"loss": 0.0006, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 4.502024291497976, |
|
"grad_norm": 0.012857378460466862, |
|
"learning_rate": 9.803096461565098e-05, |
|
"loss": 0.0005, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 4.510121457489879, |
|
"grad_norm": 0.007671073079109192, |
|
"learning_rate": 9.801515939680159e-05, |
|
"loss": 0.0006, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 4.518218623481781, |
|
"grad_norm": 0.008667264133691788, |
|
"learning_rate": 9.799929228235532e-05, |
|
"loss": 0.0009, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 4.526315789473684, |
|
"grad_norm": 0.009722660295665264, |
|
"learning_rate": 9.798336329276623e-05, |
|
"loss": 0.0009, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 4.534412955465587, |
|
"grad_norm": 0.00602738605812192, |
|
"learning_rate": 9.796737244856811e-05, |
|
"loss": 0.0006, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.5425101214574894, |
|
"grad_norm": 0.006649456452578306, |
|
"learning_rate": 9.795131977037451e-05, |
|
"loss": 0.0006, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 4.550607287449393, |
|
"grad_norm": 0.009024589322507381, |
|
"learning_rate": 9.79352052788787e-05, |
|
"loss": 0.0006, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 4.558704453441296, |
|
"grad_norm": 0.0069068074226379395, |
|
"learning_rate": 9.79190289948536e-05, |
|
"loss": 0.0009, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 4.566801619433198, |
|
"grad_norm": 0.007059336174279451, |
|
"learning_rate": 9.790279093915183e-05, |
|
"loss": 0.0008, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 4.574898785425101, |
|
"grad_norm": 0.005149452481418848, |
|
"learning_rate": 9.788649113270562e-05, |
|
"loss": 0.0005, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 4.582995951417004, |
|
"grad_norm": 0.008085817098617554, |
|
"learning_rate": 9.787012959652677e-05, |
|
"loss": 0.0009, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 4.5910931174089065, |
|
"grad_norm": 0.005893132649362087, |
|
"learning_rate": 9.785370635170671e-05, |
|
"loss": 0.0008, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 4.59919028340081, |
|
"grad_norm": 0.006068812217563391, |
|
"learning_rate": 9.783722141941636e-05, |
|
"loss": 0.0007, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 4.607287449392713, |
|
"grad_norm": 0.00651150569319725, |
|
"learning_rate": 9.782067482090624e-05, |
|
"loss": 0.0006, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.008106587454676628, |
|
"learning_rate": 9.780406657750626e-05, |
|
"loss": 0.001, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.623481781376518, |
|
"grad_norm": 0.00984260905534029, |
|
"learning_rate": 9.778739671062586e-05, |
|
"loss": 0.0007, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 4.631578947368421, |
|
"grad_norm": 0.011000724509358406, |
|
"learning_rate": 9.777066524175394e-05, |
|
"loss": 0.001, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 4.6396761133603235, |
|
"grad_norm": 0.00920114666223526, |
|
"learning_rate": 9.775387219245876e-05, |
|
"loss": 0.0012, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 4.647773279352227, |
|
"grad_norm": 0.008547363802790642, |
|
"learning_rate": 9.773701758438796e-05, |
|
"loss": 0.0008, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 4.65587044534413, |
|
"grad_norm": 0.011154532432556152, |
|
"learning_rate": 9.772010143926856e-05, |
|
"loss": 0.0007, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.65587044534413, |
|
"eval_loss": 0.0017410250147804618, |
|
"eval_runtime": 20.8952, |
|
"eval_samples_per_second": 4.786, |
|
"eval_steps_per_second": 1.196, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.663967611336032, |
|
"grad_norm": 0.008925629779696465, |
|
"learning_rate": 9.77031237789069e-05, |
|
"loss": 0.0009, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 4.672064777327935, |
|
"grad_norm": 0.006519634742289782, |
|
"learning_rate": 9.768608462518865e-05, |
|
"loss": 0.0007, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 4.680161943319838, |
|
"grad_norm": 0.0046993098221719265, |
|
"learning_rate": 9.766898400007869e-05, |
|
"loss": 0.0004, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 4.6882591093117405, |
|
"grad_norm": 0.011550773866474628, |
|
"learning_rate": 9.765182192562117e-05, |
|
"loss": 0.0007, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 4.696356275303644, |
|
"grad_norm": 0.008509055711328983, |
|
"learning_rate": 9.763459842393945e-05, |
|
"loss": 0.0007, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.704453441295547, |
|
"grad_norm": 0.009534220211207867, |
|
"learning_rate": 9.76173135172361e-05, |
|
"loss": 0.0012, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 4.712550607287449, |
|
"grad_norm": 0.008667496033012867, |
|
"learning_rate": 9.759996722779281e-05, |
|
"loss": 0.0006, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 4.720647773279352, |
|
"grad_norm": 0.004869160708039999, |
|
"learning_rate": 9.758255957797042e-05, |
|
"loss": 0.0004, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 4.728744939271255, |
|
"grad_norm": 0.007150961086153984, |
|
"learning_rate": 9.756509059020884e-05, |
|
"loss": 0.0006, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 4.7368421052631575, |
|
"grad_norm": 0.008666688576340675, |
|
"learning_rate": 9.75475602870271e-05, |
|
"loss": 0.0008, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 4.744939271255061, |
|
"grad_norm": 0.006314212456345558, |
|
"learning_rate": 9.752996869102322e-05, |
|
"loss": 0.001, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 4.753036437246964, |
|
"grad_norm": 0.007860385812819004, |
|
"learning_rate": 9.751231582487428e-05, |
|
"loss": 0.0007, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 4.761133603238866, |
|
"grad_norm": 0.007355378940701485, |
|
"learning_rate": 9.749460171133629e-05, |
|
"loss": 0.001, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"grad_norm": 0.008818419650197029, |
|
"learning_rate": 9.747682637324425e-05, |
|
"loss": 0.0007, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 4.777327935222672, |
|
"grad_norm": 0.008113800548017025, |
|
"learning_rate": 9.745898983351204e-05, |
|
"loss": 0.0008, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.7854251012145745, |
|
"grad_norm": 0.006416656076908112, |
|
"learning_rate": 9.744109211513253e-05, |
|
"loss": 0.0009, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 4.793522267206478, |
|
"grad_norm": 0.009136557579040527, |
|
"learning_rate": 9.742313324117736e-05, |
|
"loss": 0.0006, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 4.801619433198381, |
|
"grad_norm": 0.008266017772257328, |
|
"learning_rate": 9.740511323479702e-05, |
|
"loss": 0.0012, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 4.809716599190283, |
|
"grad_norm": 0.005936949979513884, |
|
"learning_rate": 9.738703211922084e-05, |
|
"loss": 0.0008, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 4.817813765182186, |
|
"grad_norm": 0.00802876427769661, |
|
"learning_rate": 9.736888991775688e-05, |
|
"loss": 0.0006, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 4.825910931174089, |
|
"grad_norm": 0.006083488930016756, |
|
"learning_rate": 9.735068665379201e-05, |
|
"loss": 0.0008, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 4.834008097165992, |
|
"grad_norm": 0.0064203874208033085, |
|
"learning_rate": 9.733242235079175e-05, |
|
"loss": 0.0008, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 4.842105263157895, |
|
"grad_norm": 0.0068838950246572495, |
|
"learning_rate": 9.731409703230035e-05, |
|
"loss": 0.0006, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 4.850202429149798, |
|
"grad_norm": 0.006466129794716835, |
|
"learning_rate": 9.729571072194066e-05, |
|
"loss": 0.0005, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 4.8582995951417, |
|
"grad_norm": 0.004359446931630373, |
|
"learning_rate": 9.727726344341419e-05, |
|
"loss": 0.0004, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.8582995951417, |
|
"eval_loss": 0.0015752798644825816, |
|
"eval_runtime": 20.8835, |
|
"eval_samples_per_second": 4.788, |
|
"eval_steps_per_second": 1.197, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.866396761133603, |
|
"grad_norm": 0.008679832331836224, |
|
"learning_rate": 9.725875522050107e-05, |
|
"loss": 0.0012, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 4.874493927125506, |
|
"grad_norm": 0.0073195514269173145, |
|
"learning_rate": 9.724018607705995e-05, |
|
"loss": 0.0013, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 4.882591093117409, |
|
"grad_norm": 0.00781306903809309, |
|
"learning_rate": 9.722155603702804e-05, |
|
"loss": 0.0008, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 4.890688259109312, |
|
"grad_norm": 0.009774848818778992, |
|
"learning_rate": 9.7202865124421e-05, |
|
"loss": 0.0006, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 4.898785425101215, |
|
"grad_norm": 0.006310292985290289, |
|
"learning_rate": 9.718411336333301e-05, |
|
"loss": 0.0006, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 4.906882591093117, |
|
"grad_norm": 0.00883979070931673, |
|
"learning_rate": 9.71653007779367e-05, |
|
"loss": 0.0014, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 4.91497975708502, |
|
"grad_norm": 0.008740267716348171, |
|
"learning_rate": 9.714642739248305e-05, |
|
"loss": 0.0008, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 0.003794717602431774, |
|
"learning_rate": 9.712749323130146e-05, |
|
"loss": 0.0004, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 4.931174089068826, |
|
"grad_norm": 0.007664462551474571, |
|
"learning_rate": 9.710849831879967e-05, |
|
"loss": 0.0009, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 4.939271255060729, |
|
"grad_norm": 0.007098712492734194, |
|
"learning_rate": 9.708944267946369e-05, |
|
"loss": 0.0007, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.947368421052632, |
|
"grad_norm": 0.004916313569992781, |
|
"learning_rate": 9.70703263378579e-05, |
|
"loss": 0.0008, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 4.955465587044534, |
|
"grad_norm": 0.006048915442079306, |
|
"learning_rate": 9.705114931862486e-05, |
|
"loss": 0.0009, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 4.963562753036437, |
|
"grad_norm": 0.007562866434454918, |
|
"learning_rate": 9.703191164648537e-05, |
|
"loss": 0.0006, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 4.97165991902834, |
|
"grad_norm": 0.006720076780766249, |
|
"learning_rate": 9.70126133462384e-05, |
|
"loss": 0.0007, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 4.979757085020243, |
|
"grad_norm": 0.005906874779611826, |
|
"learning_rate": 9.699325444276109e-05, |
|
"loss": 0.0004, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 4.987854251012146, |
|
"grad_norm": 0.004341105464845896, |
|
"learning_rate": 9.697383496100872e-05, |
|
"loss": 0.0004, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 4.995951417004049, |
|
"grad_norm": 0.005335621070116758, |
|
"learning_rate": 9.695435492601464e-05, |
|
"loss": 0.0004, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 5.004048582995951, |
|
"grad_norm": 0.010734565556049347, |
|
"learning_rate": 9.693481436289025e-05, |
|
"loss": 0.0014, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 5.012145748987854, |
|
"grad_norm": 0.0031302126590162516, |
|
"learning_rate": 9.691521329682499e-05, |
|
"loss": 0.0003, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 5.020242914979757, |
|
"grad_norm": 0.006032292731106281, |
|
"learning_rate": 9.68955517530863e-05, |
|
"loss": 0.0004, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 5.02834008097166, |
|
"grad_norm": 0.005305845756083727, |
|
"learning_rate": 9.687582975701956e-05, |
|
"loss": 0.0005, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 5.036437246963563, |
|
"grad_norm": 0.008457905612885952, |
|
"learning_rate": 9.685604733404808e-05, |
|
"loss": 0.0004, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 5.044534412955466, |
|
"grad_norm": 0.00620026420801878, |
|
"learning_rate": 9.68362045096731e-05, |
|
"loss": 0.0004, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 5.052631578947368, |
|
"grad_norm": 0.008932286873459816, |
|
"learning_rate": 9.681630130947367e-05, |
|
"loss": 0.0003, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 5.060728744939271, |
|
"grad_norm": 0.007199987303465605, |
|
"learning_rate": 9.679633775910672e-05, |
|
"loss": 0.0004, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 5.060728744939271, |
|
"eval_loss": 0.0016215384239330888, |
|
"eval_runtime": 20.8859, |
|
"eval_samples_per_second": 4.788, |
|
"eval_steps_per_second": 1.197, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 5.068825910931174, |
|
"grad_norm": 0.0068572270683944225, |
|
"learning_rate": 9.677631388430694e-05, |
|
"loss": 0.0004, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 5.076923076923077, |
|
"grad_norm": 0.007633959408849478, |
|
"learning_rate": 9.675622971088681e-05, |
|
"loss": 0.0006, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 5.08502024291498, |
|
"grad_norm": 0.005319299641996622, |
|
"learning_rate": 9.673608526473649e-05, |
|
"loss": 0.0007, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 5.093117408906883, |
|
"grad_norm": 0.003859872929751873, |
|
"learning_rate": 9.671588057182391e-05, |
|
"loss": 0.0003, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 5.101214574898785, |
|
"grad_norm": 0.007842942140996456, |
|
"learning_rate": 9.669561565819463e-05, |
|
"loss": 0.0004, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 5.109311740890688, |
|
"grad_norm": 0.007960239425301552, |
|
"learning_rate": 9.66752905499718e-05, |
|
"loss": 0.0004, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 5.117408906882591, |
|
"grad_norm": 0.0059306202456355095, |
|
"learning_rate": 9.665490527335622e-05, |
|
"loss": 0.0004, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 5.125506072874494, |
|
"grad_norm": 0.006550755817443132, |
|
"learning_rate": 9.663445985462624e-05, |
|
"loss": 0.0006, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 5.133603238866397, |
|
"grad_norm": 0.009190104901790619, |
|
"learning_rate": 9.661395432013773e-05, |
|
"loss": 0.0006, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 5.1417004048583, |
|
"grad_norm": 0.0046551162376999855, |
|
"learning_rate": 9.659338869632406e-05, |
|
"loss": 0.0005, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 5.149797570850202, |
|
"grad_norm": 0.0077586546540260315, |
|
"learning_rate": 9.657276300969604e-05, |
|
"loss": 0.0005, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 5.157894736842105, |
|
"grad_norm": 0.009180366061627865, |
|
"learning_rate": 9.655207728684194e-05, |
|
"loss": 0.0007, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 5.165991902834008, |
|
"grad_norm": 0.003306223312392831, |
|
"learning_rate": 9.65313315544274e-05, |
|
"loss": 0.0003, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 5.174089068825911, |
|
"grad_norm": 0.0038234649691730738, |
|
"learning_rate": 9.65105258391954e-05, |
|
"loss": 0.0004, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 5.182186234817814, |
|
"grad_norm": 0.00610633147880435, |
|
"learning_rate": 9.64896601679663e-05, |
|
"loss": 0.0005, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 5.190283400809717, |
|
"grad_norm": 0.004961833823472261, |
|
"learning_rate": 9.64687345676377e-05, |
|
"loss": 0.0005, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 5.198380566801619, |
|
"grad_norm": 0.004935056436806917, |
|
"learning_rate": 9.644774906518445e-05, |
|
"loss": 0.0006, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 5.206477732793522, |
|
"grad_norm": 0.005315741058439016, |
|
"learning_rate": 9.642670368765865e-05, |
|
"loss": 0.0006, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 5.2145748987854255, |
|
"grad_norm": 0.0038190498016774654, |
|
"learning_rate": 9.640559846218958e-05, |
|
"loss": 0.0004, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 5.222672064777328, |
|
"grad_norm": 0.005875582341104746, |
|
"learning_rate": 9.638443341598364e-05, |
|
"loss": 0.0004, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 5.230769230769231, |
|
"grad_norm": 0.006707495544105768, |
|
"learning_rate": 9.636320857632437e-05, |
|
"loss": 0.0005, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 5.238866396761134, |
|
"grad_norm": 0.0063120415434241295, |
|
"learning_rate": 9.634192397057238e-05, |
|
"loss": 0.0005, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 5.246963562753036, |
|
"grad_norm": 0.009964760392904282, |
|
"learning_rate": 9.632057962616531e-05, |
|
"loss": 0.0008, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 5.255060728744939, |
|
"grad_norm": 0.005848821718245745, |
|
"learning_rate": 9.629917557061787e-05, |
|
"loss": 0.0005, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 5.2631578947368425, |
|
"grad_norm": 0.004280323162674904, |
|
"learning_rate": 9.627771183152164e-05, |
|
"loss": 0.0004, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.2631578947368425, |
|
"eval_loss": 0.0015275988262146711, |
|
"eval_runtime": 20.8893, |
|
"eval_samples_per_second": 4.787, |
|
"eval_steps_per_second": 1.197, |
|
"step": 650 |
|
}, |
|
{
"epoch": 5.271255060728745,
"grad_norm": 0.004903740715235472,
"learning_rate": 9.625618843654523e-05,
"loss": 0.0005,
"step": 651
},
{
"epoch": 5.279352226720648,
"grad_norm": 0.005046170204877853,
"learning_rate": 9.62346054134341e-05,
"loss": 0.0005,
"step": 652
},
{
"epoch": 5.287449392712551,
"grad_norm": 0.005617137067019939,
"learning_rate": 9.621296279001059e-05,
"loss": 0.0004,
"step": 653
},
{
"epoch": 5.295546558704453,
"grad_norm": 0.004350410774350166,
"learning_rate": 9.619126059417387e-05,
"loss": 0.0004,
"step": 654
},
{
"epoch": 5.303643724696356,
"grad_norm": 0.005870455875992775,
"learning_rate": 9.616949885389991e-05,
"loss": 0.0005,
"step": 655
},
{
"epoch": 5.3117408906882595,
"grad_norm": 0.009995516389608383,
"learning_rate": 9.614767759724143e-05,
"loss": 0.0007,
"step": 656
},
{
"epoch": 5.319838056680162,
"grad_norm": 0.002616090467199683,
"learning_rate": 9.612579685232788e-05,
"loss": 0.0003,
"step": 657
},
{
"epoch": 5.327935222672065,
"grad_norm": 0.005312880035489798,
"learning_rate": 9.610385664736536e-05,
"loss": 0.0005,
"step": 658
},
{
"epoch": 5.336032388663968,
"grad_norm": 0.006149233318865299,
"learning_rate": 9.60818570106367e-05,
"loss": 0.0004,
"step": 659
},
{
"epoch": 5.34412955465587,
"grad_norm": 0.005688710603863001,
"learning_rate": 9.605979797050124e-05,
"loss": 0.0006,
"step": 660
},
{
"epoch": 5.352226720647773,
"grad_norm": 0.00494812149554491,
"learning_rate": 9.603767955539495e-05,
"loss": 0.0004,
"step": 661
},
{
"epoch": 5.3603238866396765,
"grad_norm": 0.004429470282047987,
"learning_rate": 9.601550179383036e-05,
"loss": 0.0006,
"step": 662
},
{
"epoch": 5.368421052631579,
"grad_norm": 0.006522475741803646,
"learning_rate": 9.599326471439647e-05,
"loss": 0.0005,
"step": 663
},
{
"epoch": 5.376518218623482,
"grad_norm": 0.005521293263882399,
"learning_rate": 9.597096834575877e-05,
"loss": 0.0005,
"step": 664
},
{
"epoch": 5.384615384615385,
"grad_norm": 0.0048112692311406136,
"learning_rate": 9.594861271665912e-05,
"loss": 0.0004,
"step": 665
},
{
"epoch": 5.392712550607287,
"grad_norm": 0.007786073721945286,
"learning_rate": 9.592619785591586e-05,
"loss": 0.0005,
"step": 666
},
{
"epoch": 5.40080971659919,
"grad_norm": 0.007375451736152172,
"learning_rate": 9.59037237924236e-05,
"loss": 0.0005,
"step": 667
},
{
"epoch": 5.4089068825910935,
"grad_norm": 0.008655287325382233,
"learning_rate": 9.588119055515333e-05,
"loss": 0.0005,
"step": 668
},
{
"epoch": 5.417004048582996,
"grad_norm": 0.002889828523620963,
"learning_rate": 9.58585981731523e-05,
"loss": 0.0003,
"step": 669
},
{
"epoch": 5.425101214574899,
"grad_norm": 0.00909927673637867,
"learning_rate": 9.583594667554399e-05,
"loss": 0.0006,
"step": 670
},
{
"epoch": 5.433198380566802,
"grad_norm": 0.003937150351703167,
"learning_rate": 9.581323609152808e-05,
"loss": 0.0003,
"step": 671
},
{
"epoch": 5.441295546558704,
"grad_norm": 0.010027210228145123,
"learning_rate": 9.579046645038047e-05,
"loss": 0.0009,
"step": 672
},
{
"epoch": 5.449392712550607,
"grad_norm": 0.006260544527322054,
"learning_rate": 9.576763778145312e-05,
"loss": 0.0005,
"step": 673
},
{
"epoch": 5.4574898785425106,
"grad_norm": 0.0034802183508872986,
"learning_rate": 9.574475011417411e-05,
"loss": 0.0004,
"step": 674
},
{
"epoch": 5.465587044534413,
"grad_norm": 0.004040226805955172,
"learning_rate": 9.57218034780476e-05,
"loss": 0.0004,
"step": 675
},
{
"epoch": 5.465587044534413,
"eval_loss": 0.0016189313028007746,
"eval_runtime": 20.8859,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 675
},
{
"epoch": 5.473684210526316,
"grad_norm": 0.0035269635263830423,
"learning_rate": 9.569879790265373e-05,
"loss": 0.0004,
"step": 676
},
{
"epoch": 5.481781376518219,
"grad_norm": 0.005436853971332312,
"learning_rate": 9.567573341764862e-05,
"loss": 0.0003,
"step": 677
},
{
"epoch": 5.489878542510121,
"grad_norm": 0.007649141363799572,
"learning_rate": 9.565261005276435e-05,
"loss": 0.0007,
"step": 678
},
{
"epoch": 5.497975708502024,
"grad_norm": 0.006564602255821228,
"learning_rate": 9.562942783780891e-05,
"loss": 0.0006,
"step": 679
},
{
"epoch": 5.506072874493928,
"grad_norm": 0.005341153126209974,
"learning_rate": 9.560618680266609e-05,
"loss": 0.0003,
"step": 680
},
{
"epoch": 5.51417004048583,
"grad_norm": 0.005836586467921734,
"learning_rate": 9.558288697729559e-05,
"loss": 0.0006,
"step": 681
},
{
"epoch": 5.522267206477733,
"grad_norm": 0.009045450948178768,
"learning_rate": 9.555952839173282e-05,
"loss": 0.0008,
"step": 682
},
{
"epoch": 5.530364372469636,
"grad_norm": 0.0071211401373147964,
"learning_rate": 9.5536111076089e-05,
"loss": 0.0004,
"step": 683
},
{
"epoch": 5.538461538461538,
"grad_norm": 0.008516835980117321,
"learning_rate": 9.5512635060551e-05,
"loss": 0.0008,
"step": 684
},
{
"epoch": 5.5465587044534415,
"grad_norm": 0.0046457210555672646,
"learning_rate": 9.548910037538141e-05,
"loss": 0.0003,
"step": 685
},
{
"epoch": 5.554655870445345,
"grad_norm": 0.006752189248800278,
"learning_rate": 9.546550705091842e-05,
"loss": 0.0005,
"step": 686
},
{
"epoch": 5.562753036437247,
"grad_norm": 0.002872879384085536,
"learning_rate": 9.544185511757581e-05,
"loss": 0.0003,
"step": 687
},
{
"epoch": 5.57085020242915,
"grad_norm": 0.009211295284330845,
"learning_rate": 9.541814460584293e-05,
"loss": 0.0005,
"step": 688
},
{
"epoch": 5.578947368421053,
"grad_norm": 0.0036671042907983065,
"learning_rate": 9.539437554628464e-05,
"loss": 0.0004,
"step": 689
},
{
"epoch": 5.587044534412955,
"grad_norm": 0.005291562993079424,
"learning_rate": 9.537054796954123e-05,
"loss": 0.0003,
"step": 690
},
{
"epoch": 5.5951417004048585,
"grad_norm": 0.0069409445859491825,
"learning_rate": 9.53466619063285e-05,
"loss": 0.0006,
"step": 691
},
{
"epoch": 5.603238866396762,
"grad_norm": 0.004893122706562281,
"learning_rate": 9.53227173874376e-05,
"loss": 0.0003,
"step": 692
},
{
"epoch": 5.611336032388664,
"grad_norm": 0.0035761839244514704,
"learning_rate": 9.529871444373502e-05,
"loss": 0.0004,
"step": 693
},
{
"epoch": 5.619433198380567,
"grad_norm": 0.005408015567809343,
"learning_rate": 9.527465310616259e-05,
"loss": 0.0004,
"step": 694
},
{
"epoch": 5.62753036437247,
"grad_norm": 0.007360650692135096,
"learning_rate": 9.52505334057374e-05,
"loss": 0.0008,
"step": 695
},
{
"epoch": 5.635627530364372,
"grad_norm": 0.004153635818511248,
"learning_rate": 9.522635537355178e-05,
"loss": 0.0004,
"step": 696
},
{
"epoch": 5.6437246963562755,
"grad_norm": 0.004713242873549461,
"learning_rate": 9.520211904077328e-05,
"loss": 0.0005,
"step": 697
},
{
"epoch": 5.651821862348179,
"grad_norm": 0.00541323609650135,
"learning_rate": 9.517782443864455e-05,
"loss": 0.0005,
"step": 698
},
{
"epoch": 5.659919028340081,
"grad_norm": 0.007091245148330927,
"learning_rate": 9.51534715984834e-05,
"loss": 0.0007,
"step": 699
},
{
"epoch": 5.668016194331984,
"grad_norm": 0.004219442140311003,
"learning_rate": 9.512906055168269e-05,
"loss": 0.0005,
"step": 700
},
{
"epoch": 5.668016194331984,
"eval_loss": 0.0014782178914174438,
"eval_runtime": 20.8959,
"eval_samples_per_second": 4.786,
"eval_steps_per_second": 1.196,
"step": 700
},
{
"epoch": 5.676113360323887,
"grad_norm": 0.003051872830837965,
"learning_rate": 9.510459132971035e-05,
"loss": 0.0004,
"step": 701
},
{
"epoch": 5.684210526315789,
"grad_norm": 0.0043883356265723705,
"learning_rate": 9.508006396410923e-05,
"loss": 0.0003,
"step": 702
},
{
"epoch": 5.6923076923076925,
"grad_norm": 0.0058163790963590145,
"learning_rate": 9.505547848649721e-05,
"loss": 0.0005,
"step": 703
},
{
"epoch": 5.700404858299595,
"grad_norm": 0.00913459062576294,
"learning_rate": 9.503083492856704e-05,
"loss": 0.0008,
"step": 704
},
{
"epoch": 5.708502024291498,
"grad_norm": 0.007107607554644346,
"learning_rate": 9.500613332208634e-05,
"loss": 0.0006,
"step": 705
},
{
"epoch": 5.716599190283401,
"grad_norm": 0.002083905041217804,
"learning_rate": 9.498137369889757e-05,
"loss": 0.0002,
"step": 706
},
{
"epoch": 5.724696356275303,
"grad_norm": 0.0058279503136873245,
"learning_rate": 9.495655609091799e-05,
"loss": 0.0005,
"step": 707
},
{
"epoch": 5.732793522267206,
"grad_norm": 0.002947121160104871,
"learning_rate": 9.493168053013957e-05,
"loss": 0.0003,
"step": 708
},
{
"epoch": 5.7408906882591095,
"grad_norm": 0.0052326153963804245,
"learning_rate": 9.490674704862901e-05,
"loss": 0.0003,
"step": 709
},
{
"epoch": 5.748987854251012,
"grad_norm": 0.0074744438752532005,
"learning_rate": 9.48817556785277e-05,
"loss": 0.0006,
"step": 710
},
{
"epoch": 5.757085020242915,
"grad_norm": 0.007500396575778723,
"learning_rate": 9.485670645205163e-05,
"loss": 0.0004,
"step": 711
},
{
"epoch": 5.765182186234818,
"grad_norm": 0.006873182021081448,
"learning_rate": 9.483159940149132e-05,
"loss": 0.0006,
"step": 712
},
{
"epoch": 5.77327935222672,
"grad_norm": 0.007117138244211674,
"learning_rate": 9.480643455921194e-05,
"loss": 0.0005,
"step": 713
},
{
"epoch": 5.781376518218623,
"grad_norm": 0.0046636732295155525,
"learning_rate": 9.478121195765303e-05,
"loss": 0.0004,
"step": 714
},
{
"epoch": 5.7894736842105265,
"grad_norm": 0.006344164256006479,
"learning_rate": 9.475593162932872e-05,
"loss": 0.0006,
"step": 715
},
{
"epoch": 5.797570850202429,
"grad_norm": 0.007509822491556406,
"learning_rate": 9.473059360682747e-05,
"loss": 0.0004,
"step": 716
},
{
"epoch": 5.805668016194332,
"grad_norm": 0.0036457066889852285,
"learning_rate": 9.47051979228121e-05,
"loss": 0.0004,
"step": 717
},
{
"epoch": 5.813765182186235,
"grad_norm": 0.0034911236725747585,
"learning_rate": 9.467974461001982e-05,
"loss": 0.0003,
"step": 718
},
{
"epoch": 5.821862348178137,
"grad_norm": 0.006103496067225933,
"learning_rate": 9.465423370126212e-05,
"loss": 0.0005,
"step": 719
},
{
"epoch": 5.82995951417004,
"grad_norm": 0.0062059275805950165,
"learning_rate": 9.462866522942468e-05,
"loss": 0.0006,
"step": 720
},
{
"epoch": 5.838056680161944,
"grad_norm": 0.003860118566080928,
"learning_rate": 9.460303922746743e-05,
"loss": 0.0003,
"step": 721
},
{
"epoch": 5.846153846153846,
"grad_norm": 0.008872183039784431,
"learning_rate": 9.457735572842445e-05,
"loss": 0.0005,
"step": 722
},
{
"epoch": 5.854251012145749,
"grad_norm": 0.008847801014780998,
"learning_rate": 9.455161476540394e-05,
"loss": 0.0005,
"step": 723
},
{
"epoch": 5.862348178137652,
"grad_norm": 0.0064693475142121315,
"learning_rate": 9.452581637158819e-05,
"loss": 0.0004,
"step": 724
},
{
"epoch": 5.870445344129554,
"grad_norm": 0.003565672319382429,
"learning_rate": 9.44999605802335e-05,
"loss": 0.0003,
"step": 725
},
{
"epoch": 5.870445344129554,
"eval_loss": 0.0015405402518808842,
"eval_runtime": 20.8981,
"eval_samples_per_second": 4.785,
"eval_steps_per_second": 1.196,
"step": 725
},
{
"epoch": 5.8785425101214575,
"grad_norm": 0.006012369878590107,
"learning_rate": 9.447404742467017e-05,
"loss": 0.0005,
"step": 726
},
{
"epoch": 5.886639676113361,
"grad_norm": 0.01144502218812704,
"learning_rate": 9.444807693830244e-05,
"loss": 0.0006,
"step": 727
},
{
"epoch": 5.894736842105263,
"grad_norm": 0.003242105944082141,
"learning_rate": 9.442204915460847e-05,
"loss": 0.0003,
"step": 728
},
{
"epoch": 5.902834008097166,
"grad_norm": 0.005479468032717705,
"learning_rate": 9.439596410714027e-05,
"loss": 0.0006,
"step": 729
},
{
"epoch": 5.910931174089069,
"grad_norm": 0.007425523828715086,
"learning_rate": 9.436982182952367e-05,
"loss": 0.0006,
"step": 730
},
{
"epoch": 5.919028340080971,
"grad_norm": 0.005463286302983761,
"learning_rate": 9.434362235545827e-05,
"loss": 0.0006,
"step": 731
},
{
"epoch": 5.9271255060728745,
"grad_norm": 0.0037547284737229347,
"learning_rate": 9.431736571871741e-05,
"loss": 0.0004,
"step": 732
},
{
"epoch": 5.935222672064778,
"grad_norm": 0.0055890390649437904,
"learning_rate": 9.429105195314812e-05,
"loss": 0.0003,
"step": 733
},
{
"epoch": 5.94331983805668,
"grad_norm": 0.0038566221483051777,
"learning_rate": 9.426468109267104e-05,
"loss": 0.0002,
"step": 734
},
{
"epoch": 5.951417004048583,
"grad_norm": 0.004965242929756641,
"learning_rate": 9.423825317128045e-05,
"loss": 0.0005,
"step": 735
},
{
"epoch": 5.959514170040486,
"grad_norm": 0.0034646911080926657,
"learning_rate": 9.42117682230442e-05,
"loss": 0.0004,
"step": 736
},
{
"epoch": 5.967611336032388,
"grad_norm": 0.010685238055884838,
"learning_rate": 9.41852262821036e-05,
"loss": 0.0006,
"step": 737
},
{
"epoch": 5.9757085020242915,
"grad_norm": 0.009992929175496101,
"learning_rate": 9.415862738267347e-05,
"loss": 0.0006,
"step": 738
},
{
"epoch": 5.983805668016195,
"grad_norm": 0.0034835604019463062,
"learning_rate": 9.413197155904201e-05,
"loss": 0.0004,
"step": 739
},
{
"epoch": 5.991902834008097,
"grad_norm": 0.006391593255102634,
"learning_rate": 9.410525884557084e-05,
"loss": 0.0006,
"step": 740
},
{
"epoch": 6.0,
"grad_norm": 0.012005936354398727,
"learning_rate": 9.407848927669494e-05,
"loss": 0.0007,
"step": 741
},
{
"epoch": 6.008097165991903,
"grad_norm": 0.005054526962339878,
"learning_rate": 9.405166288692249e-05,
"loss": 0.0003,
"step": 742
},
{
"epoch": 6.016194331983805,
"grad_norm": 0.0041589937172830105,
"learning_rate": 9.402477971083501e-05,
"loss": 0.0004,
"step": 743
},
{
"epoch": 6.0242914979757085,
"grad_norm": 0.004005232825875282,
"learning_rate": 9.399783978308716e-05,
"loss": 0.0003,
"step": 744
},
{
"epoch": 6.032388663967612,
"grad_norm": 0.0026451381854712963,
"learning_rate": 9.39708431384068e-05,
"loss": 0.0003,
"step": 745
},
{
"epoch": 6.040485829959514,
"grad_norm": 0.0035752663388848305,
"learning_rate": 9.39437898115949e-05,
"loss": 0.0003,
"step": 746
},
{
"epoch": 6.048582995951417,
"grad_norm": 0.00357494642958045,
"learning_rate": 9.391667983752545e-05,
"loss": 0.0004,
"step": 747
},
{
"epoch": 6.05668016194332,
"grad_norm": 0.006460043601691723,
"learning_rate": 9.388951325114552e-05,
"loss": 0.0006,
"step": 748
},
{
"epoch": 6.064777327935222,
"grad_norm": 0.004135911352932453,
"learning_rate": 9.386229008747514e-05,
"loss": 0.0003,
"step": 749
},
{
"epoch": 6.0728744939271255,
"grad_norm": 0.0059121702797710896,
"learning_rate": 9.383501038160725e-05,
"loss": 0.0006,
"step": 750
},
{
"epoch": 6.0728744939271255,
"eval_loss": 0.0015712064923718572,
"eval_runtime": 20.9013,
"eval_samples_per_second": 4.784,
"eval_steps_per_second": 1.196,
"step": 750
},
{
"epoch": 6.080971659919029,
"grad_norm": 0.0053249807097017765,
"learning_rate": 9.380767416870768e-05,
"loss": 0.0004,
"step": 751
},
{
"epoch": 6.089068825910931,
"grad_norm": 0.0062401313334703445,
"learning_rate": 9.378028148401516e-05,
"loss": 0.0005,
"step": 752
},
{
"epoch": 6.097165991902834,
"grad_norm": 0.003968521486967802,
"learning_rate": 9.375283236284116e-05,
"loss": 0.0004,
"step": 753
},
{
"epoch": 6.105263157894737,
"grad_norm": 0.0026025085244327784,
"learning_rate": 9.37253268405699e-05,
"loss": 0.0003,
"step": 754
},
{
"epoch": 6.113360323886639,
"grad_norm": 0.004716400057077408,
"learning_rate": 9.369776495265831e-05,
"loss": 0.0003,
"step": 755
},
{
"epoch": 6.1214574898785425,
"grad_norm": 0.006493438966572285,
"learning_rate": 9.367014673463605e-05,
"loss": 0.0003,
"step": 756
},
{
"epoch": 6.129554655870446,
"grad_norm": 0.004002865869551897,
"learning_rate": 9.364247222210529e-05,
"loss": 0.0004,
"step": 757
},
{
"epoch": 6.137651821862348,
"grad_norm": 0.006383996922522783,
"learning_rate": 9.361474145074081e-05,
"loss": 0.0004,
"step": 758
},
{
"epoch": 6.145748987854251,
"grad_norm": 0.008037807419896126,
"learning_rate": 9.358695445628996e-05,
"loss": 0.0004,
"step": 759
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.002863495144993067,
"learning_rate": 9.355911127457247e-05,
"loss": 0.0003,
"step": 760
},
{
"epoch": 6.161943319838056,
"grad_norm": 0.0033732058946043253,
"learning_rate": 9.353121194148058e-05,
"loss": 0.0003,
"step": 761
},
{
"epoch": 6.17004048582996,
"grad_norm": 0.00469600223004818,
"learning_rate": 9.35032564929789e-05,
"loss": 0.0003,
"step": 762
},
{
"epoch": 6.178137651821863,
"grad_norm": 0.0032547153532505035,
"learning_rate": 9.347524496510436e-05,
"loss": 0.0003,
"step": 763
},
{
"epoch": 6.186234817813765,
"grad_norm": 0.003363592317327857,
"learning_rate": 9.344717739396616e-05,
"loss": 0.0003,
"step": 764
},
{
"epoch": 6.194331983805668,
"grad_norm": 0.006892678793519735,
"learning_rate": 9.341905381574579e-05,
"loss": 0.0004,
"step": 765
},
{
"epoch": 6.202429149797571,
"grad_norm": 0.003087045392021537,
"learning_rate": 9.339087426669692e-05,
"loss": 0.0003,
"step": 766
},
{
"epoch": 6.2105263157894735,
"grad_norm": 0.004086201544851065,
"learning_rate": 9.336263878314536e-05,
"loss": 0.0003,
"step": 767
},
{
"epoch": 6.218623481781377,
"grad_norm": 0.007387399207800627,
"learning_rate": 9.333434740148904e-05,
"loss": 0.0004,
"step": 768
},
{
"epoch": 6.22672064777328,
"grad_norm": 0.008816695772111416,
"learning_rate": 9.330600015819795e-05,
"loss": 0.0003,
"step": 769
},
{
"epoch": 6.234817813765182,
"grad_norm": 0.005616582930088043,
"learning_rate": 9.327759708981406e-05,
"loss": 0.0005,
"step": 770
},
{
"epoch": 6.242914979757085,
"grad_norm": 0.0017385140527039766,
"learning_rate": 9.324913823295133e-05,
"loss": 0.0002,
"step": 771
},
{
"epoch": 6.251012145748988,
"grad_norm": 0.005367297679185867,
"learning_rate": 9.322062362429564e-05,
"loss": 0.0003,
"step": 772
},
{
"epoch": 6.2591093117408905,
"grad_norm": 0.00460553914308548,
"learning_rate": 9.319205330060475e-05,
"loss": 0.0003,
"step": 773
},
{
"epoch": 6.267206477732794,
"grad_norm": 0.007978829555213451,
"learning_rate": 9.316342729870818e-05,
"loss": 0.0007,
"step": 774
},
{
"epoch": 6.275303643724697,
"grad_norm": 0.0032822596840560436,
"learning_rate": 9.313474565550729e-05,
"loss": 0.0003,
"step": 775
},
{
"epoch": 6.275303643724697,
"eval_loss": 0.001532125286757946,
"eval_runtime": 20.8909,
"eval_samples_per_second": 4.787,
"eval_steps_per_second": 1.197,
"step": 775
},
{
"epoch": 6.283400809716599,
"grad_norm": 0.00468923756852746,
"learning_rate": 9.310600840797512e-05,
"loss": 0.0003,
"step": 776
},
{
"epoch": 6.291497975708502,
"grad_norm": 0.003942742943763733,
"learning_rate": 9.307721559315644e-05,
"loss": 0.0003,
"step": 777
},
{
"epoch": 6.299595141700405,
"grad_norm": 0.0031825690530240536,
"learning_rate": 9.304836724816758e-05,
"loss": 0.0003,
"step": 778
},
{
"epoch": 6.3076923076923075,
"grad_norm": 0.014810767956078053,
"learning_rate": 9.301946341019653e-05,
"loss": 0.0005,
"step": 779
},
{
"epoch": 6.315789473684211,
"grad_norm": 0.0029204150196164846,
"learning_rate": 9.299050411650276e-05,
"loss": 0.0003,
"step": 780
},
{
"epoch": 6.323886639676114,
"grad_norm": 0.005694466643035412,
"learning_rate": 9.296148940441727e-05,
"loss": 0.0003,
"step": 781
},
{
"epoch": 6.331983805668016,
"grad_norm": 0.004693406168371439,
"learning_rate": 9.293241931134244e-05,
"loss": 0.0003,
"step": 782
},
{
"epoch": 6.340080971659919,
"grad_norm": 0.005331242457032204,
"learning_rate": 9.290329387475212e-05,
"loss": 0.0003,
"step": 783
},
{
"epoch": 6.348178137651822,
"grad_norm": 0.005269868765026331,
"learning_rate": 9.28741131321914e-05,
"loss": 0.0003,
"step": 784
},
{
"epoch": 6.3562753036437245,
"grad_norm": 0.008096984587609768,
"learning_rate": 9.284487712127677e-05,
"loss": 0.0003,
"step": 785
},
{
"epoch": 6.364372469635628,
"grad_norm": 0.005979133769869804,
"learning_rate": 9.281558587969591e-05,
"loss": 0.0003,
"step": 786
},
{
"epoch": 6.372469635627531,
"grad_norm": 0.005570698995143175,
"learning_rate": 9.27862394452077e-05,
"loss": 0.0003,
"step": 787
},
{
"epoch": 6.380566801619433,
"grad_norm": 0.0197993665933609,
"learning_rate": 9.275683785564216e-05,
"loss": 0.0004,
"step": 788
},
{
"epoch": 6.388663967611336,
"grad_norm": 0.0068647717125713825,
"learning_rate": 9.272738114890043e-05,
"loss": 0.0004,
"step": 789
},
{
"epoch": 6.396761133603239,
"grad_norm": 0.005690258927643299,
"learning_rate": 9.269786936295471e-05,
"loss": 0.0003,
"step": 790
},
{
"epoch": 6.4048582995951415,
"grad_norm": 0.003921832423657179,
"learning_rate": 9.266830253584815e-05,
"loss": 0.0004,
"step": 791
},
{
"epoch": 6.412955465587045,
"grad_norm": 0.007321410812437534,
"learning_rate": 9.263868070569494e-05,
"loss": 0.0005,
"step": 792
},
{
"epoch": 6.421052631578947,
"grad_norm": 0.006045639980584383,
"learning_rate": 9.260900391068008e-05,
"loss": 0.0004,
"step": 793
},
{
"epoch": 6.42914979757085,
"grad_norm": 0.006704007275402546,
"learning_rate": 9.257927218905947e-05,
"loss": 0.0005,
"step": 794
},
{
"epoch": 6.437246963562753,
"grad_norm": 0.005516419652849436,
"learning_rate": 9.254948557915983e-05,
"loss": 0.0005,
"step": 795
},
{
"epoch": 6.445344129554655,
"grad_norm": 0.007129244972020388,
"learning_rate": 9.25196441193786e-05,
"loss": 0.0004,
"step": 796
},
{
"epoch": 6.4534412955465585,
"grad_norm": 0.006566017400473356,
"learning_rate": 9.248974784818396e-05,
"loss": 0.0006,
"step": 797
},
{
"epoch": 6.461538461538462,
"grad_norm": 0.004069427493959665,
"learning_rate": 9.245979680411469e-05,
"loss": 0.0003,
"step": 798
},
{
"epoch": 6.469635627530364,
"grad_norm": 0.00994227733463049,
"learning_rate": 9.242979102578027e-05,
"loss": 0.0005,
"step": 799
},
{
"epoch": 6.477732793522267,
"grad_norm": 0.009190657176077366,
"learning_rate": 9.239973055186066e-05,
"loss": 0.0003,
"step": 800
},
{
"epoch": 6.477732793522267,
"eval_loss": 0.001472490606829524,
"eval_runtime": 20.8842,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 800
},
{
"epoch": 6.48582995951417,
"grad_norm": 0.004918240942060947,
"learning_rate": 9.236961542110634e-05,
"loss": 0.0003,
"step": 801
},
{
"epoch": 6.493927125506072,
"grad_norm": 0.0071716029196977615,
"learning_rate": 9.233944567233825e-05,
"loss": 0.0004,
"step": 802
},
{
"epoch": 6.502024291497976,
"grad_norm": 0.006958463694900274,
"learning_rate": 9.230922134444779e-05,
"loss": 0.0006,
"step": 803
},
{
"epoch": 6.510121457489879,
"grad_norm": 0.004981396719813347,
"learning_rate": 9.227894247639661e-05,
"loss": 0.0005,
"step": 804
},
{
"epoch": 6.518218623481781,
"grad_norm": 0.0036478445399552584,
"learning_rate": 9.224860910721679e-05,
"loss": 0.0003,
"step": 805
},
{
"epoch": 6.526315789473684,
"grad_norm": 0.0037034437991678715,
"learning_rate": 9.221822127601057e-05,
"loss": 0.0003,
"step": 806
},
{
"epoch": 6.534412955465587,
"grad_norm": 0.008089096285402775,
"learning_rate": 9.218777902195043e-05,
"loss": 0.0006,
"step": 807
},
{
"epoch": 6.5425101214574894,
"grad_norm": 0.0034797810949385166,
"learning_rate": 9.215728238427901e-05,
"loss": 0.0004,
"step": 808
},
{
"epoch": 6.550607287449393,
"grad_norm": 0.011531390249729156,
"learning_rate": 9.212673140230907e-05,
"loss": 0.0006,
"step": 809
},
{
"epoch": 6.558704453441296,
"grad_norm": 0.0037022933829575777,
"learning_rate": 9.20961261154234e-05,
"loss": 0.0003,
"step": 810
},
{
"epoch": 6.566801619433198,
"grad_norm": 0.0049828048795461655,
"learning_rate": 9.206546656307478e-05,
"loss": 0.0003,
"step": 811
},
{
"epoch": 6.574898785425101,
"grad_norm": 0.004933022893965244,
"learning_rate": 9.2034752784786e-05,
"loss": 0.0004,
"step": 812
},
{
"epoch": 6.582995951417004,
"grad_norm": 0.0033220879267901182,
"learning_rate": 9.200398482014967e-05,
"loss": 0.0003,
"step": 813
},
{
"epoch": 6.5910931174089065,
"grad_norm": 0.0034737708047032356,
"learning_rate": 9.197316270882833e-05,
"loss": 0.0003,
"step": 814
},
{
"epoch": 6.59919028340081,
"grad_norm": 0.004562221933156252,
"learning_rate": 9.194228649055427e-05,
"loss": 0.0005,
"step": 815
},
{
"epoch": 6.607287449392713,
"grad_norm": 0.004385192412883043,
"learning_rate": 9.191135620512956e-05,
"loss": 0.0005,
"step": 816
},
{
"epoch": 6.615384615384615,
"grad_norm": 0.002965794876217842,
"learning_rate": 9.188037189242593e-05,
"loss": 0.0003,
"step": 817
},
{
"epoch": 6.623481781376518,
"grad_norm": 0.0037224399857223034,
"learning_rate": 9.184933359238479e-05,
"loss": 0.0004,
"step": 818
},
{
"epoch": 6.631578947368421,
"grad_norm": 0.00610083993524313,
"learning_rate": 9.181824134501711e-05,
"loss": 0.0003,
"step": 819
},
{
"epoch": 6.6396761133603235,
"grad_norm": 0.004559030756354332,
"learning_rate": 9.178709519040347e-05,
"loss": 0.0003,
"step": 820
},
{
"epoch": 6.647773279352227,
"grad_norm": 0.004950647708028555,
"learning_rate": 9.175589516869386e-05,
"loss": 0.0003,
"step": 821
},
{
"epoch": 6.65587044534413,
"grad_norm": 0.0062233456410467625,
"learning_rate": 9.172464132010773e-05,
"loss": 0.0006,
"step": 822
},
{
"epoch": 6.663967611336032,
"grad_norm": 0.005544841755181551,
"learning_rate": 9.169333368493396e-05,
"loss": 0.0004,
"step": 823
},
{
"epoch": 6.672064777327935,
"grad_norm": 0.005783478729426861,
"learning_rate": 9.166197230353073e-05,
"loss": 0.0003,
"step": 824
},
{
"epoch": 6.680161943319838,
"grad_norm": 0.004205208271741867,
"learning_rate": 9.163055721632549e-05,
"loss": 0.0003,
"step": 825
},
{
"epoch": 6.680161943319838,
"eval_loss": 0.0014739898033440113,
"eval_runtime": 20.8491,
"eval_samples_per_second": 4.796,
"eval_steps_per_second": 1.199,
"step": 825
},
{
"epoch": 6.6882591093117405,
"grad_norm": 0.002174858469516039,
"learning_rate": 9.159908846381498e-05,
"loss": 0.0002,
"step": 826
},
{
"epoch": 6.696356275303644,
"grad_norm": 0.007229868322610855,
"learning_rate": 9.156756608656506e-05,
"loss": 0.0004,
"step": 827
},
{
"epoch": 6.704453441295547,
"grad_norm": 0.00587072828784585,
"learning_rate": 9.153599012521073e-05,
"loss": 0.0003,
"step": 828
},
{
"epoch": 6.712550607287449,
"grad_norm": 0.00896474625915289,
"learning_rate": 9.150436062045607e-05,
"loss": 0.0003,
"step": 829
},
{
"epoch": 6.720647773279352,
"grad_norm": 0.005071448162198067,
"learning_rate": 9.147267761307421e-05,
"loss": 0.0003,
"step": 830
},
{
"epoch": 6.728744939271255,
"grad_norm": 0.005802792031317949,
"learning_rate": 9.144094114390718e-05,
"loss": 0.0003,
"step": 831
},
{
"epoch": 6.7368421052631575,
"grad_norm": 0.0023914834018796682,
"learning_rate": 9.140915125386602e-05,
"loss": 0.0003,
"step": 832
},
{
"epoch": 6.744939271255061,
"grad_norm": 0.004418396390974522,
"learning_rate": 9.137730798393054e-05,
"loss": 0.0003,
"step": 833
},
{
"epoch": 6.753036437246964,
"grad_norm": 0.006196359172463417,
"learning_rate": 9.134541137514945e-05,
"loss": 0.0003,
"step": 834
},
{
"epoch": 6.761133603238866,
"grad_norm": 0.0021507740020751953,
"learning_rate": 9.131346146864013e-05,
"loss": 0.0002,
"step": 835
},
{
"epoch": 6.769230769230769,
"grad_norm": 0.003959763795137405,
"learning_rate": 9.128145830558872e-05,
"loss": 0.0003,
"step": 836
},
{
"epoch": 6.777327935222672,
"grad_norm": 0.0037787449546158314,
"learning_rate": 9.124940192725002e-05,
"loss": 0.0003,
"step": 837
},
{
"epoch": 6.7854251012145745,
"grad_norm": 0.008174480870366096,
"learning_rate": 9.121729237494738e-05,
"loss": 0.0006,
"step": 838
},
{
"epoch": 6.793522267206478,
"grad_norm": 0.004929924383759499,
"learning_rate": 9.118512969007276e-05,
"loss": 0.0004,
"step": 839
},
{
"epoch": 6.801619433198381,
"grad_norm": 0.004881354980170727,
"learning_rate": 9.115291391408656e-05,
"loss": 0.0004,
"step": 840
},
{
"epoch": 6.809716599190283,
"grad_norm": 0.004849132616072893,
"learning_rate": 9.112064508851763e-05,
"loss": 0.0003,
"step": 841
},
{
"epoch": 6.817813765182186,
"grad_norm": 0.007561651989817619,
"learning_rate": 9.108832325496322e-05,
"loss": 0.0004,
"step": 842
},
{
"epoch": 6.825910931174089,
"grad_norm": 0.005616433918476105,
"learning_rate": 9.105594845508891e-05,
"loss": 0.0003,
"step": 843
},
{
"epoch": 6.834008097165992,
"grad_norm": 0.004294142127037048,
"learning_rate": 9.102352073062854e-05,
"loss": 0.0004,
"step": 844
},
{
"epoch": 6.842105263157895,
"grad_norm": 0.0037746201269328594,
"learning_rate": 9.09910401233842e-05,
"loss": 0.0003,
"step": 845
},
{
"epoch": 6.850202429149798,
"grad_norm": 0.008090085349977016,
"learning_rate": 9.095850667522611e-05,
"loss": 0.001,
"step": 846
},
{
"epoch": 6.8582995951417,
"grad_norm": 0.006233837455511093,
"learning_rate": 9.092592042809267e-05,
"loss": 0.0004,
"step": 847
},
{
"epoch": 6.866396761133603,
"grad_norm": 0.01793971285223961,
"learning_rate": 9.08932814239903e-05,
"loss": 0.0004,
"step": 848
},
{
"epoch": 6.874493927125506,
"grad_norm": 0.008522222749888897,
"learning_rate": 9.086058970499341e-05,
"loss": 0.0003,
"step": 849
},
{
"epoch": 6.882591093117409,
"grad_norm": 0.004106331150978804,
"learning_rate": 9.082784531324437e-05,
"loss": 0.0004,
"step": 850
},
{
"epoch": 6.882591093117409,
"eval_loss": 0.0014429772272706032,
"eval_runtime": 20.9236,
"eval_samples_per_second": 4.779,
"eval_steps_per_second": 1.195,
"step": 850
},
{
"epoch": 6.890688259109312,
"grad_norm": 0.005821602884680033,
"learning_rate": 9.079504829095354e-05,
"loss": 0.0004,
"step": 851
},
{
"epoch": 6.898785425101215,
"grad_norm": 0.0054057384841144085,
"learning_rate": 9.076219868039899e-05,
"loss": 0.0004,
"step": 852
},
{
"epoch": 6.906882591093117,
"grad_norm": 0.0044207195751369,
"learning_rate": 9.072929652392666e-05,
"loss": 0.0004,
"step": 853
},
{
"epoch": 6.91497975708502,
"grad_norm": 0.005526500288397074,
"learning_rate": 9.069634186395022e-05,
"loss": 0.0004,
"step": 854
},
{
"epoch": 6.923076923076923,
"grad_norm": 0.003987747244536877,
"learning_rate": 9.066333474295099e-05,
"loss": 0.0003,
"step": 855
},
{
"epoch": 6.931174089068826,
"grad_norm": 0.006826246622949839,
"learning_rate": 9.063027520347796e-05,
"loss": 0.0004,
"step": 856
},
{
"epoch": 6.939271255060729,
"grad_norm": 0.008676338940858841,
"learning_rate": 9.059716328814765e-05,
"loss": 0.0008,
"step": 857
},
{
"epoch": 6.947368421052632,
"grad_norm": 0.006324645131826401,
"learning_rate": 9.056399903964414e-05,
"loss": 0.0003,
"step": 858
},
{
"epoch": 6.955465587044534,
"grad_norm": 0.008029861375689507,
"learning_rate": 9.053078250071891e-05,
"loss": 0.0005,
"step": 859
},
{
"epoch": 6.963562753036437,
"grad_norm": 0.008121266961097717,
"learning_rate": 9.049751371419093e-05,
"loss": 0.0006,
"step": 860
},
{
"epoch": 6.97165991902834,
"grad_norm": 0.0037181430961936712,
"learning_rate": 9.046419272294644e-05,
"loss": 0.0004,
"step": 861
},
{
"epoch": 6.979757085020243,
"grad_norm": 0.003322604577988386,
"learning_rate": 9.043081956993904e-05,
"loss": 0.0003,
"step": 862
},
{
"epoch": 6.987854251012146,
"grad_norm": 0.003378738649189472,
"learning_rate": 9.039739429818953e-05,
"loss": 0.0003,
"step": 863
},
{
"epoch": 6.995951417004049,
"grad_norm": 0.0044447388499975204,
"learning_rate": 9.036391695078589e-05,
"loss": 0.0004,
"step": 864
},
{
"epoch": 7.004048582995951,
"grad_norm": 0.009173744358122349,
"learning_rate": 9.03303875708833e-05,
"loss": 0.0006,
"step": 865
},
{
"epoch": 7.012145748987854,
"grad_norm": 0.005636727903038263,
"learning_rate": 9.029680620170392e-05,
"loss": 0.0003,
"step": 866
},
{
"epoch": 7.020242914979757,
"grad_norm": 0.007775536272674799,
"learning_rate": 9.026317288653698e-05,
"loss": 0.0003,
"step": 867
},
{
"epoch": 7.02834008097166,
"grad_norm": 0.002120030578225851,
"learning_rate": 9.022948766873868e-05,
"loss": 0.0002,
"step": 868
},
{
"epoch": 7.036437246963563,
"grad_norm": 0.0061560156755149364,
"learning_rate": 9.019575059173209e-05,
"loss": 0.0003,
"step": 869
},
{
"epoch": 7.044534412955466,
"grad_norm": 0.003942039795219898,
"learning_rate": 9.016196169900717e-05,
"loss": 0.0003,
"step": 870
},
{
"epoch": 7.052631578947368,
"grad_norm": 0.005075725261121988,
"learning_rate": 9.012812103412065e-05,
"loss": 0.0003,
"step": 871
},
{
"epoch": 7.060728744939271,
"grad_norm": 0.008946137502789497,
"learning_rate": 9.0094228640696e-05,
"loss": 0.0008,
"step": 872
},
{
"epoch": 7.068825910931174,
"grad_norm": 0.005815677810460329,
"learning_rate": 9.006028456242339e-05,
"loss": 0.0004,
"step": 873
},
{
"epoch": 7.076923076923077,
"grad_norm": 0.0050154379568994045,
"learning_rate": 9.002628884305959e-05,
"loss": 0.0004,
"step": 874
},
{
"epoch": 7.08502024291498,
"grad_norm": 0.00504196947440505,
"learning_rate": 8.999224152642798e-05,
"loss": 0.0003,
"step": 875
},
{
"epoch": 7.08502024291498,
"eval_loss": 0.0014816973125562072,
"eval_runtime": 20.8403,
"eval_samples_per_second": 4.798,
"eval_steps_per_second": 1.2,
"step": 875
},
{
"epoch": 7.093117408906883,
"grad_norm": 0.0035983005072921515,
"learning_rate": 8.995814265641841e-05,
"loss": 0.0004,
"step": 876
},
{
"epoch": 7.101214574898785,
"grad_norm": 0.004920099396258593,
"learning_rate": 8.992399227698721e-05,
"loss": 0.0004,
"step": 877
},
{
"epoch": 7.109311740890688,
"grad_norm": 0.007466362789273262,
"learning_rate": 8.988979043215708e-05,
"loss": 0.0004,
"step": 878
},
{
"epoch": 7.117408906882591,
"grad_norm": 0.0035266538616269827,
"learning_rate": 8.985553716601711e-05,
"loss": 0.0003,
"step": 879
},
{
"epoch": 7.125506072874494,
"grad_norm": 0.0027432111091911793,
"learning_rate": 8.982123252272265e-05,
"loss": 0.0003,
"step": 880
},
{
"epoch": 7.133603238866397,
"grad_norm": 0.005367937497794628,
"learning_rate": 8.97868765464953e-05,
"loss": 0.0004,
"step": 881
},
{
"epoch": 7.1417004048583,
"grad_norm": 0.00508796377107501,
"learning_rate": 8.97524692816228e-05,
"loss": 0.0004,
"step": 882
},
{
"epoch": 7.149797570850202,
"grad_norm": 0.005310658365488052,
"learning_rate": 8.9718010772459e-05,
"loss": 0.0004,
"step": 883
},
{
"epoch": 7.157894736842105,
"grad_norm": 0.003940463997423649,
"learning_rate": 8.968350106342387e-05,
"loss": 0.0003,
"step": 884
},
{
"epoch": 7.165991902834008,
"grad_norm": 0.0020052019972354174,
"learning_rate": 8.964894019900332e-05,
"loss": 0.0002,
"step": 885
},
{
"epoch": 7.174089068825911,
"grad_norm": 0.0013627801090478897,
"learning_rate": 8.961432822374922e-05,
"loss": 0.0002,
"step": 886
},
{
"epoch": 7.182186234817814,
"grad_norm": 0.004774102475494146,
"learning_rate": 8.957966518227934e-05,
"loss": 0.0005,
"step": 887
},
{
"epoch": 7.190283400809717,
"grad_norm": 0.004429248161613941,
"learning_rate": 8.954495111927726e-05,
"loss": 0.0004,
"step": 888
},
{
"epoch": 7.198380566801619,
"grad_norm": 0.0024098637513816357,
"learning_rate": 8.951018607949232e-05,
"loss": 0.0002,
"step": 889
},
{
"epoch": 7.206477732793522,
"grad_norm": 0.004957470111548901,
"learning_rate": 8.947537010773966e-05,
"loss": 0.0004,
"step": 890
},
{
"epoch": 7.2145748987854255,
"grad_norm": 0.0025283123832195997,
"learning_rate": 8.944050324889995e-05,
"loss": 0.0003,
"step": 891
},
{
"epoch": 7.222672064777328,
"grad_norm": 0.00853675790131092,
"learning_rate": 8.940558554791952e-05,
"loss": 0.0003,
"step": 892
},
{
"epoch": 7.230769230769231,
"grad_norm": 0.004788931459188461,
"learning_rate": 8.937061704981026e-05,
"loss": 0.0003,
"step": 893
},
{
"epoch": 7.238866396761134,
"grad_norm": 0.007903149351477623,
"learning_rate": 8.933559779964951e-05,
"loss": 0.0003,
"step": 894
},
{
"epoch": 7.246963562753036,
"grad_norm": 0.01020093634724617,
"learning_rate": 8.930052784258004e-05,
"loss": 0.0003,
"step": 895
},
{
"epoch": 7.255060728744939,
"grad_norm": 0.0071566407568752766,
"learning_rate": 8.926540722380999e-05,
"loss": 0.0003,
"step": 896
},
{
"epoch": 7.2631578947368425,
"grad_norm": 0.011600234545767307,
"learning_rate": 8.92302359886128e-05,
"loss": 0.0006,
"step": 897
},
{
"epoch": 7.271255060728745,
"grad_norm": 0.0032473020255565643,
"learning_rate": 8.919501418232716e-05,
"loss": 0.0003,
"step": 898
},
{
"epoch": 7.279352226720648,
"grad_norm": 0.003534214338287711,
"learning_rate": 8.915974185035696e-05,
"loss": 0.0003,
"step": 899
},
{
"epoch": 7.287449392712551,
"grad_norm": 0.00417256960645318,
"learning_rate": 8.912441903817122e-05,
"loss": 0.0004,
"step": 900
},
{
"epoch": 7.287449392712551,
"eval_loss": 0.001444089226424694,
"eval_runtime": 20.861,
"eval_samples_per_second": 4.794,
"eval_steps_per_second": 1.198,
"step": 900
},
{
"epoch": 7.295546558704453,
"grad_norm": 0.004150230437517166,
"learning_rate": 8.908904579130403e-05,
"loss": 0.0005,
"step": 901
},
{
"epoch": 7.303643724696356,
"grad_norm": 0.0044599175453186035,
"learning_rate": 8.905362215535447e-05,
"loss": 0.0003,
"step": 902
},
{
"epoch": 7.3117408906882595,
"grad_norm": 0.004847770091146231,
"learning_rate": 8.901814817598664e-05,
"loss": 0.0003,
"step": 903
},
{
"epoch": 7.319838056680162,
"grad_norm": 0.006142038386315107,
"learning_rate": 8.898262389892946e-05,
"loss": 0.0003,
"step": 904
},
{
"epoch": 7.327935222672065,
"grad_norm": 0.004584239795804024,
"learning_rate": 8.894704936997674e-05,
"loss": 0.0003,
"step": 905
},
{
"epoch": 7.336032388663968,
"grad_norm": 0.00513102114200592,
"learning_rate": 8.891142463498705e-05,
"loss": 0.0005,
"step": 906
},
{
"epoch": 7.34412955465587,
"grad_norm": 0.003908930346369743,
"learning_rate": 8.887574973988368e-05,
"loss": 0.0003,
"step": 907
},
{
"epoch": 7.352226720647773,
"grad_norm": 0.003959581255912781,
"learning_rate": 8.884002473065459e-05,
"loss": 0.0002,
"step": 908
},
{
"epoch": 7.3603238866396765,
"grad_norm": 0.005572907626628876,
"learning_rate": 8.880424965335234e-05,
"loss": 0.0003,
"step": 909
},
{
"epoch": 7.368421052631579,
"grad_norm": 0.005206345114856958,
"learning_rate": 8.8768424554094e-05,
"loss": 0.0005,
"step": 910
},
{
"epoch": 7.376518218623482,
"grad_norm": 0.003059947630390525,
"learning_rate": 8.87325494790612e-05,
"loss": 0.0002,
"step": 911
},
{
"epoch": 7.384615384615385,
"grad_norm": 0.0039919642731547356,
"learning_rate": 8.86966244744999e-05,
"loss": 0.0003,
"step": 912
},
{
"epoch": 7.392712550607287,
"grad_norm": 0.008040755987167358,
"learning_rate": 8.866064958672047e-05,
"loss": 0.0004,
"step": 913
},
{
"epoch": 7.40080971659919,
"grad_norm": 0.003773482283577323,
"learning_rate": 8.862462486209758e-05,
"loss": 0.0003,
"step": 914
},
{
"epoch": 7.4089068825910935,
"grad_norm": 0.003051398554816842,
"learning_rate": 8.858855034707016e-05,
"loss": 0.0002,
"step": 915
},
{
"epoch": 7.417004048582996,
"grad_norm": 0.012867518700659275,
"learning_rate": 8.855242608814132e-05,
"loss": 0.0003,
"step": 916
},
{
"epoch": 7.425101214574899,
"grad_norm": 0.008691791445016861,
"learning_rate": 8.851625213187823e-05,
"loss": 0.0003,
"step": 917
},
{
"epoch": 7.433198380566802,
"grad_norm": 0.0036156801506876945,
"learning_rate": 8.848002852491222e-05,
"loss": 0.0003,
"step": 918
},
{
"epoch": 7.441295546558704,
"grad_norm": 0.00877736322581768,
"learning_rate": 8.844375531393856e-05,
"loss": 0.0005,
"step": 919
},
{
"epoch": 7.449392712550607,
"grad_norm": 0.006146470084786415,
"learning_rate": 8.840743254571648e-05,
"loss": 0.0003,
"step": 920
},
{
"epoch": 7.4574898785425106,
"grad_norm": 0.005941275041550398,
"learning_rate": 8.837106026706911e-05,
"loss": 0.0003,
"step": 921
},
{
"epoch": 7.465587044534413,
"grad_norm": 0.006193244829773903,
"learning_rate": 8.83346385248834e-05,
"loss": 0.0003,
"step": 922
},
{
"epoch": 7.473684210526316,
"grad_norm": 0.005591843742877245,
"learning_rate": 8.829816736611003e-05,
"loss": 0.0004,
"step": 923
},
{
"epoch": 7.481781376518219,
"grad_norm": 0.004145904444158077,
"learning_rate": 8.82616468377634e-05,
"loss": 0.0002,
"step": 924
},
{
"epoch": 7.489878542510121,
"grad_norm": 0.00641997903585434,
"learning_rate": 8.82250769869216e-05,
"loss": 0.0004,
"step": 925
},
{
"epoch": 7.489878542510121,
"eval_loss": 0.0014737433521077037,
"eval_runtime": 20.8683,
"eval_samples_per_second": 4.792,
"eval_steps_per_second": 1.198,
"step": 925
},
{
"epoch": 7.497975708502024,
"grad_norm": 0.0023201555013656616,
"learning_rate": 8.81884578607262e-05,
"loss": 0.0003,
"step": 926
},
{
"epoch": 7.506072874493928,
"grad_norm": 0.004539423622190952,
"learning_rate": 8.815178950638239e-05,
"loss": 0.0003,
"step": 927
},
{
"epoch": 7.51417004048583,
"grad_norm": 0.005551203154027462,
"learning_rate": 8.811507197115876e-05,
"loss": 0.0003,
"step": 928
},
{
"epoch": 7.522267206477733,
"grad_norm": 0.0017370175337418914,
"learning_rate": 8.80783053023873e-05,
"loss": 0.0002,
"step": 929
},
{
"epoch": 7.530364372469636,
"grad_norm": 0.0032071254681795835,
"learning_rate": 8.804148954746338e-05,
"loss": 0.0003,
"step": 930
},
{
"epoch": 7.538461538461538,
"grad_norm": 0.008781791664659977,
"learning_rate": 8.80046247538456e-05,
"loss": 0.0004,
"step": 931
},
{
"epoch": 7.5465587044534415,
"grad_norm": 0.007886053062975407,
"learning_rate": 8.796771096905581e-05,
"loss": 0.0004,
"step": 932
},
{
"epoch": 7.554655870445345,
"grad_norm": 0.00439440319314599,
"learning_rate": 8.793074824067898e-05,
"loss": 0.0004,
"step": 933
},
{
"epoch": 7.562753036437247,
"grad_norm": 0.006747337989509106,
"learning_rate": 8.789373661636318e-05,
"loss": 0.0004,
"step": 934
},
{
"epoch": 7.57085020242915,
"grad_norm": 0.004344824235886335,
"learning_rate": 8.785667614381956e-05,
"loss": 0.0003,
"step": 935
},
{
"epoch": 7.578947368421053,
"grad_norm": 0.014749690890312195,
"learning_rate": 8.781956687082215e-05,
"loss": 0.0004,
"step": 936
},
{
"epoch": 7.587044534412955,
"grad_norm": 0.006340457126498222,
"learning_rate": 8.778240884520798e-05,
"loss": 0.0006,
"step": 937
},
{
"epoch": 7.5951417004048585,
"grad_norm": 0.0055120596662163734,
"learning_rate": 8.774520211487689e-05,
"loss": 0.0003,
"step": 938
},
{
"epoch": 7.603238866396762,
"grad_norm": 0.0028302748687565327,
"learning_rate": 8.770794672779145e-05,
"loss": 0.0003,
"step": 939
},
{
"epoch": 7.611336032388664,
"grad_norm": 0.006510081235319376,
"learning_rate": 8.767064273197705e-05,
"loss": 0.0002,
"step": 940
},
{
"epoch": 7.619433198380567,
"grad_norm": 0.0060364557430148125,
"learning_rate": 8.763329017552165e-05,
"loss": 0.0004,
"step": 941
},
{
"epoch": 7.62753036437247,
"grad_norm": 0.0069529772736132145,
"learning_rate": 8.759588910657588e-05,
"loss": 0.0005,
"step": 942
},
{
"epoch": 7.635627530364372,
"grad_norm": 0.004733328241854906,
"learning_rate": 8.755843957335287e-05,
"loss": 0.0003,
"step": 943
},
{
"epoch": 7.6437246963562755,
"grad_norm": 0.0061776163056492805,
"learning_rate": 8.752094162412823e-05,
"loss": 0.0004,
"step": 944
},
{
"epoch": 7.651821862348179,
"grad_norm": 0.002511364873498678,
"learning_rate": 8.748339530723999e-05,
"loss": 0.0003,
"step": 945
},
{
"epoch": 7.659919028340081,
"grad_norm": 0.005621533375233412,
"learning_rate": 8.744580067108851e-05,
"loss": 0.0005,
"step": 946
},
{
"epoch": 7.668016194331984,
"grad_norm": 0.004172396846115589,
"learning_rate": 8.740815776413649e-05,
"loss": 0.0002,
"step": 947
},
{
"epoch": 7.676113360323887,
"grad_norm": 0.005457951687276363,
"learning_rate": 8.737046663490877e-05,
"loss": 0.0003,
"step": 948
},
{
"epoch": 7.684210526315789,
"grad_norm": 0.0014876670902594924,
"learning_rate": 8.733272733199241e-05,
"loss": 0.0002,
"step": 949
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.01848655752837658,
"learning_rate": 8.72949399040366e-05,
"loss": 0.0004,
"step": 950
},
{
"epoch": 7.6923076923076925,
"eval_loss": 0.0013926097890362144,
"eval_runtime": 20.8744,
"eval_samples_per_second": 4.791,
"eval_steps_per_second": 1.198,
"step": 950
},
{
"epoch": 7.700404858299595,
"grad_norm": 0.006439445540308952,
"learning_rate": 8.725710439975247e-05,
"loss": 0.0004,
"step": 951
},
{
"epoch": 7.708502024291498,
"grad_norm": 0.004773348104208708,
"learning_rate": 8.721922086791321e-05,
"loss": 0.0003,
"step": 952
},
{
"epoch": 7.716599190283401,
"grad_norm": 0.00517980707809329,
"learning_rate": 8.71812893573539e-05,
"loss": 0.0004,
"step": 953
},
{
"epoch": 7.724696356275303,
"grad_norm": 0.006808164995163679,
"learning_rate": 8.714330991697144e-05,
"loss": 0.0004,
"step": 954
},
{
"epoch": 7.732793522267206,
"grad_norm": 0.0021944716572761536,
"learning_rate": 8.710528259572456e-05,
"loss": 0.0003,
"step": 955
},
{
"epoch": 7.7408906882591095,
"grad_norm": 0.002964154351502657,
"learning_rate": 8.706720744263368e-05,
"loss": 0.0003,
"step": 956
},
{
"epoch": 7.748987854251012,
"grad_norm": 0.007951617240905762,
"learning_rate": 8.702908450678088e-05,
"loss": 0.0005,
"step": 957
},
{
"epoch": 7.757085020242915,
"grad_norm": 0.007497166749089956,
"learning_rate": 8.699091383730987e-05,
"loss": 0.0006,
"step": 958
},
{
"epoch": 7.765182186234818,
"grad_norm": 0.0034041095059365034,
"learning_rate": 8.695269548342584e-05,
"loss": 0.0003,
"step": 959
},
{
"epoch": 7.77327935222672,
"grad_norm": 0.006926527712494135,
"learning_rate": 8.691442949439548e-05,
"loss": 0.0006,
"step": 960
},
{
"epoch": 7.781376518218623,
"grad_norm": 0.006445242557674646,
"learning_rate": 8.68761159195469e-05,
"loss": 0.0005,
"step": 961
},
{
"epoch": 7.7894736842105265,
"grad_norm": 0.004453368950635195,
"learning_rate": 8.683775480826953e-05,
"loss": 0.0003,
"step": 962
},
{
"epoch": 7.797570850202429,
"grad_norm": 0.005603456404060125,
"learning_rate": 8.679934621001407e-05,
"loss": 0.0003,
"step": 963
},
{
"epoch": 7.805668016194332,
"grad_norm": 0.005167273338884115,
"learning_rate": 8.676089017429246e-05,
"loss": 0.0004,
"step": 964
},
{
"epoch": 7.813765182186235,
"grad_norm": 0.005380601156502962,
"learning_rate": 8.672238675067779e-05,
"loss": 0.0005,
"step": 965
},
{
"epoch": 7.821862348178137,
"grad_norm": 0.008584062568843365,
"learning_rate": 8.668383598880419e-05,
"loss": 0.0004,
"step": 966
},
{
"epoch": 7.82995951417004,
"grad_norm": 0.004554002545773983,
"learning_rate": 8.664523793836688e-05,
"loss": 0.0004,
"step": 967
},
{
"epoch": 7.838056680161944,
"grad_norm": 0.006077317520976067,
"learning_rate": 8.660659264912202e-05,
"loss": 0.0003,
"step": 968
},
{
"epoch": 7.846153846153846,
"grad_norm": 0.005693916697055101,
"learning_rate": 8.656790017088659e-05,
"loss": 0.0003,
"step": 969
},
{
"epoch": 7.854251012145749,
"grad_norm": 0.006276635453104973,
"learning_rate": 8.652916055353852e-05,
"loss": 0.0005,
"step": 970
},
{
"epoch": 7.862348178137652,
"grad_norm": 0.006607879418879747,
"learning_rate": 8.649037384701643e-05,
"loss": 0.0003,
"step": 971
},
{
"epoch": 7.870445344129554,
"grad_norm": 0.005511141382157803,
"learning_rate": 8.645154010131968e-05,
"loss": 0.0004,
"step": 972
},
{
"epoch": 7.8785425101214575,
"grad_norm": 0.004318153951317072,
"learning_rate": 8.641265936650824e-05,
"loss": 0.0003,
"step": 973
},
{
"epoch": 7.886639676113361,
"grad_norm": 0.007693867664784193,
"learning_rate": 8.637373169270264e-05,
"loss": 0.0004,
"step": 974
},
{
"epoch": 7.894736842105263,
"grad_norm": 0.005701543763279915,
"learning_rate": 8.633475713008396e-05,
"loss": 0.0005,
"step": 975
},
{
"epoch": 7.894736842105263,
"eval_loss": 0.0013676926027983427,
"eval_runtime": 20.8821,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 1.197,
"step": 975
},
{
"epoch": 7.902834008097166,
"grad_norm": 0.003408844815567136,
"learning_rate": 8.62957357288937e-05,
"loss": 0.0002,
"step": 976
},
{
"epoch": 7.910931174089069,
"grad_norm": 0.005576197523623705,
"learning_rate": 8.625666753943375e-05,
"loss": 0.0003,
"step": 977
},
{
"epoch": 7.919028340080971,
"grad_norm": 0.0071185557171702385,
"learning_rate": 8.62175526120663e-05,
"loss": 0.0005,
"step": 978
},
{
"epoch": 7.9271255060728745,
"grad_norm": 0.0053787208162248135,
"learning_rate": 8.617839099721379e-05,
"loss": 0.0004,
"step": 979
},
{
"epoch": 7.935222672064778,
"grad_norm": 0.002727283863350749,
"learning_rate": 8.613918274535884e-05,
"loss": 0.0002,
"step": 980
},
{
"epoch": 7.94331983805668,
"grad_norm": 0.003927669022232294,
"learning_rate": 8.609992790704424e-05,
"loss": 0.0003,
"step": 981
},
{
"epoch": 7.951417004048583,
"grad_norm": 0.0018980104941874743,
"learning_rate": 8.606062653287276e-05,
"loss": 0.0002,
"step": 982
},
{
"epoch": 7.959514170040486,
"grad_norm": 0.004018806852400303,
"learning_rate": 8.60212786735072e-05,
"loss": 0.0004,
"step": 983
},
{
"epoch": 7.967611336032388,
"grad_norm": 0.0028705436270684004,
"learning_rate": 8.598188437967027e-05,
"loss": 0.0003,
"step": 984
},
{
"epoch": 7.9757085020242915,
"grad_norm": 0.0033207128290086985,
"learning_rate": 8.594244370214455e-05,
"loss": 0.0002,
"step": 985
},
{
"epoch": 7.983805668016195,
"grad_norm": 0.0061567616648972034,
"learning_rate": 8.59029566917724e-05,
"loss": 0.0005,
"step": 986
},
{
"epoch": 7.991902834008097,
"grad_norm": 0.005984210874885321,
"learning_rate": 8.58634233994559e-05,
"loss": 0.0005,
"step": 987
},
{
"epoch": 8.0,
"grad_norm": 0.004795044660568237,
"learning_rate": 8.582384387615685e-05,
"loss": 0.0005,
"step": 988
},
{
"epoch": 8.008097165991902,
"grad_norm": 0.003854956943541765,
"learning_rate": 8.578421817289654e-05,
"loss": 0.0004,
"step": 989
},
{
"epoch": 8.016194331983806,
"grad_norm": 0.001711231074295938,
"learning_rate": 8.57445463407559e-05,
"loss": 0.0002,
"step": 990
},
{
"epoch": 8.024291497975709,
"grad_norm": 0.0018254719907417893,
"learning_rate": 8.570482843087524e-05,
"loss": 0.0002,
"step": 991
},
{
"epoch": 8.03238866396761,
"grad_norm": 0.0035706062335520983,
"learning_rate": 8.566506449445432e-05,
"loss": 0.0002,
"step": 992
},
{
"epoch": 8.040485829959515,
"grad_norm": 0.00326129631139338,
"learning_rate": 8.562525458275219e-05,
"loss": 0.0002,
"step": 993
},
{
"epoch": 8.048582995951417,
"grad_norm": 0.004104081075638533,
"learning_rate": 8.558539874708722e-05,
"loss": 0.0003,
"step": 994
},
{
"epoch": 8.05668016194332,
"grad_norm": 0.0026021164376288652,
"learning_rate": 8.554549703883692e-05,
"loss": 0.0003,
"step": 995
},
{
"epoch": 8.064777327935223,
"grad_norm": 0.00469414284452796,
"learning_rate": 8.550554950943798e-05,
"loss": 0.0003,
"step": 996
},
{
"epoch": 8.072874493927126,
"grad_norm": 0.012351175770163536,
"learning_rate": 8.546555621038613e-05,
"loss": 0.0005,
"step": 997
},
{
"epoch": 8.080971659919028,
"grad_norm": 0.005616688635200262,
"learning_rate": 8.542551719323613e-05,
"loss": 0.0003,
"step": 998
},
{
"epoch": 8.089068825910932,
"grad_norm": 0.0016029590042307973,
"learning_rate": 8.538543250960164e-05,
"loss": 0.0002,
"step": 999
},
{
"epoch": 8.097165991902834,
"grad_norm": 0.0028078278992325068,
"learning_rate": 8.534530221115519e-05,
"loss": 0.0003,
"step": 1000
},
{
"epoch": 8.097165991902834,
"eval_loss": 0.001421115593984723,
"eval_runtime": 20.8707,
"eval_samples_per_second": 4.791,
"eval_steps_per_second": 1.198,
"step": 1000
},
{ |
|
"epoch": 8.105263157894736, |
|
"grad_norm": 0.004125694278627634, |
|
"learning_rate": 8.530512634962817e-05, |
|
"loss": 0.0003, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 8.11336032388664, |
|
"grad_norm": 0.0021855218801647425, |
|
"learning_rate": 8.526490497681063e-05, |
|
"loss": 0.0002, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 8.121457489878543, |
|
"grad_norm": 0.0038041502702981234, |
|
"learning_rate": 8.52246381445513e-05, |
|
"loss": 0.0003, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 8.129554655870445, |
|
"grad_norm": 0.006479162722826004, |
|
"learning_rate": 8.518432590475756e-05, |
|
"loss": 0.0003, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 8.137651821862349, |
|
"grad_norm": 0.004671668168157339, |
|
"learning_rate": 8.514396830939528e-05, |
|
"loss": 0.0005, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 8.145748987854251, |
|
"grad_norm": 0.0065527367405593395, |
|
"learning_rate": 8.51035654104888e-05, |
|
"loss": 0.0005, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 8.153846153846153, |
|
"grad_norm": 0.0015392835484817624, |
|
"learning_rate": 8.50631172601209e-05, |
|
"loss": 0.0002, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 8.161943319838057, |
|
"grad_norm": 0.003252090886235237, |
|
"learning_rate": 8.502262391043264e-05, |
|
"loss": 0.0003, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 8.17004048582996, |
|
"grad_norm": 0.007202859967947006, |
|
"learning_rate": 8.498208541362335e-05, |
|
"loss": 0.0004, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 8.178137651821862, |
|
"grad_norm": 0.005900178104639053, |
|
"learning_rate": 8.494150182195062e-05, |
|
"loss": 0.0004, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 8.186234817813766, |
|
"grad_norm": 0.0021305892150849104, |
|
"learning_rate": 8.49008731877301e-05, |
|
"loss": 0.0002, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 8.194331983805668, |
|
"grad_norm": 0.0073636360466480255, |
|
"learning_rate": 8.486019956333555e-05, |
|
"loss": 0.0003, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 8.20242914979757, |
|
"grad_norm": 0.006871379911899567, |
|
"learning_rate": 8.48194810011987e-05, |
|
"loss": 0.0006, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 8.210526315789474, |
|
"grad_norm": 0.004495650064200163, |
|
"learning_rate": 8.47787175538092e-05, |
|
"loss": 0.0003, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 8.218623481781377, |
|
"grad_norm": 0.008418884128332138, |
|
"learning_rate": 8.47379092737146e-05, |
|
"loss": 0.0004, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 8.226720647773279, |
|
"grad_norm": 0.0037393553648144007, |
|
"learning_rate": 8.46970562135202e-05, |
|
"loss": 0.0003, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 8.234817813765183, |
|
"grad_norm": 0.003387110074982047, |
|
"learning_rate": 8.465615842588908e-05, |
|
"loss": 0.0003, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 8.242914979757085, |
|
"grad_norm": 0.00927880872040987, |
|
"learning_rate": 8.46152159635419e-05, |
|
"loss": 0.0005, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 8.251012145748987, |
|
"grad_norm": 0.0031711291521787643, |
|
"learning_rate": 8.457422887925698e-05, |
|
"loss": 0.0002, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 8.259109311740891, |
|
"grad_norm": 0.00468886224552989, |
|
"learning_rate": 8.453319722587014e-05, |
|
"loss": 0.0003, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 8.267206477732794, |
|
"grad_norm": 0.0016716530080884695, |
|
"learning_rate": 8.449212105627464e-05, |
|
"loss": 0.0002, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 8.275303643724696, |
|
"grad_norm": 0.005183354951441288, |
|
"learning_rate": 8.445100042342111e-05, |
|
"loss": 0.0002, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 8.2834008097166, |
|
"grad_norm": 0.006208460312336683, |
|
"learning_rate": 8.440983538031754e-05, |
|
"loss": 0.0005, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 8.291497975708502, |
|
"grad_norm": 0.005015834234654903, |
|
"learning_rate": 8.436862598002917e-05, |
|
"loss": 0.0003, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 8.299595141700404, |
|
"grad_norm": 0.00472809886559844, |
|
"learning_rate": 8.432737227567836e-05, |
|
"loss": 0.0003, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 8.299595141700404, |
|
"eval_loss": 0.0013881891500204802, |
|
"eval_runtime": 20.8543, |
|
"eval_samples_per_second": 4.795, |
|
"eval_steps_per_second": 1.199, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 8.307692307692308, |
|
"grad_norm": 0.0067214383743703365, |
|
"learning_rate": 8.428607432044464e-05, |
|
"loss": 0.0003, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 8.31578947368421, |
|
"grad_norm": 0.0032693762332201004, |
|
"learning_rate": 8.424473216756456e-05, |
|
"loss": 0.0002, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 8.323886639676113, |
|
"grad_norm": 0.003940957598388195, |
|
"learning_rate": 8.420334587033164e-05, |
|
"loss": 0.0002, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 8.331983805668017, |
|
"grad_norm": 0.0030299958307296038, |
|
"learning_rate": 8.416191548209634e-05, |
|
"loss": 0.0003, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 8.34008097165992, |
|
"grad_norm": 0.006264000199735165, |
|
"learning_rate": 8.412044105626588e-05, |
|
"loss": 0.0003, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 8.348178137651821, |
|
"grad_norm": 0.005418987013399601, |
|
"learning_rate": 8.407892264630435e-05, |
|
"loss": 0.0003, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 8.356275303643725, |
|
"grad_norm": 0.004369835369288921, |
|
"learning_rate": 8.403736030573246e-05, |
|
"loss": 0.0003, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 8.364372469635628, |
|
"grad_norm": 0.0046693203039467335, |
|
"learning_rate": 8.399575408812759e-05, |
|
"loss": 0.0002, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 8.37246963562753, |
|
"grad_norm": 0.006310211028903723, |
|
"learning_rate": 8.395410404712366e-05, |
|
"loss": 0.0003, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 8.380566801619434, |
|
"grad_norm": 0.005021234508603811, |
|
"learning_rate": 8.39124102364111e-05, |
|
"loss": 0.0002, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 8.388663967611336, |
|
"grad_norm": 0.006567994132637978, |
|
"learning_rate": 8.387067270973676e-05, |
|
"loss": 0.0003, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 8.396761133603238, |
|
"grad_norm": 0.003345700679346919, |
|
"learning_rate": 8.382889152090382e-05, |
|
"loss": 0.0003, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 8.404858299595142, |
|
"grad_norm": 0.004254522267729044, |
|
"learning_rate": 8.378706672377177e-05, |
|
"loss": 0.0002, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 8.412955465587045, |
|
"grad_norm": 0.004765935242176056, |
|
"learning_rate": 8.374519837225632e-05, |
|
"loss": 0.0002, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 8.421052631578947, |
|
"grad_norm": 0.004489066544920206, |
|
"learning_rate": 8.370328652032928e-05, |
|
"loss": 0.0003, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 8.429149797570851, |
|
"grad_norm": 0.003664980176836252, |
|
"learning_rate": 8.366133122201861e-05, |
|
"loss": 0.0002, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 8.437246963562753, |
|
"grad_norm": 0.0038972117472440004, |
|
"learning_rate": 8.361933253140821e-05, |
|
"loss": 0.0003, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 8.445344129554655, |
|
"grad_norm": 0.006108574103564024, |
|
"learning_rate": 8.357729050263794e-05, |
|
"loss": 0.0003, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 8.45344129554656, |
|
"grad_norm": 0.005771995056420565, |
|
"learning_rate": 8.353520518990353e-05, |
|
"loss": 0.0003, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 8.461538461538462, |
|
"grad_norm": 0.0028857016004621983, |
|
"learning_rate": 8.34930766474565e-05, |
|
"loss": 0.0002, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 8.469635627530364, |
|
"grad_norm": 0.002775567816570401, |
|
"learning_rate": 8.34509049296041e-05, |
|
"loss": 0.0002, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 8.477732793522268, |
|
"grad_norm": 0.00725373113527894, |
|
"learning_rate": 8.340869009070924e-05, |
|
"loss": 0.0005, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 8.48582995951417, |
|
"grad_norm": 0.008823963813483715, |
|
"learning_rate": 8.336643218519043e-05, |
|
"loss": 0.0005, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 8.493927125506072, |
|
"grad_norm": 0.00491480203345418, |
|
"learning_rate": 8.332413126752165e-05, |
|
"loss": 0.0003, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 8.502024291497976, |
|
"grad_norm": 0.00424035731703043, |
|
"learning_rate": 8.328178739223238e-05, |
|
"loss": 0.0003, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 8.502024291497976, |
|
"eval_loss": 0.001589785679243505, |
|
"eval_runtime": 20.8531, |
|
"eval_samples_per_second": 4.795, |
|
"eval_steps_per_second": 1.199, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 8.510121457489879, |
|
"grad_norm": 0.00579440500587225, |
|
"learning_rate": 8.323940061390745e-05, |
|
"loss": 0.0003, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 8.518218623481781, |
|
"grad_norm": 0.008121831342577934, |
|
"learning_rate": 8.319697098718697e-05, |
|
"loss": 0.0005, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 8.526315789473685, |
|
"grad_norm": 0.003369817277416587, |
|
"learning_rate": 8.315449856676636e-05, |
|
"loss": 0.0003, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 8.534412955465587, |
|
"grad_norm": 0.00540441507473588, |
|
"learning_rate": 8.311198340739612e-05, |
|
"loss": 0.0003, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 8.54251012145749, |
|
"grad_norm": 0.0026687076315283775, |
|
"learning_rate": 8.306942556388189e-05, |
|
"loss": 0.0002, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 8.550607287449393, |
|
"grad_norm": 0.007054275833070278, |
|
"learning_rate": 8.302682509108435e-05, |
|
"loss": 0.0004, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 8.558704453441296, |
|
"grad_norm": 0.002858961233869195, |
|
"learning_rate": 8.298418204391907e-05, |
|
"loss": 0.0002, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 8.566801619433198, |
|
"grad_norm": 0.005906047765165567, |
|
"learning_rate": 8.294149647735659e-05, |
|
"loss": 0.0005, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 8.574898785425102, |
|
"grad_norm": 0.0035569374449551105, |
|
"learning_rate": 8.289876844642215e-05, |
|
"loss": 0.0003, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 8.582995951417004, |
|
"grad_norm": 0.004486700054258108, |
|
"learning_rate": 8.285599800619584e-05, |
|
"loss": 0.0003, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 8.591093117408906, |
|
"grad_norm": 0.003070216393098235, |
|
"learning_rate": 8.281318521181234e-05, |
|
"loss": 0.0003, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 8.59919028340081, |
|
"grad_norm": 0.0035653486847877502, |
|
"learning_rate": 8.277033011846099e-05, |
|
"loss": 0.0004, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 8.607287449392713, |
|
"grad_norm": 0.004825572948902845, |
|
"learning_rate": 8.27274327813856e-05, |
|
"loss": 0.0003, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 8.615384615384615, |
|
"grad_norm": 0.0030901050195097923, |
|
"learning_rate": 8.268449325588447e-05, |
|
"loss": 0.0002, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 8.623481781376519, |
|
"grad_norm": 0.004419588949531317, |
|
"learning_rate": 8.264151159731029e-05, |
|
"loss": 0.0003, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 8.631578947368421, |
|
"grad_norm": 0.0031760812271386385, |
|
"learning_rate": 8.259848786107003e-05, |
|
"loss": 0.0003, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 8.639676113360323, |
|
"grad_norm": 0.009143234230577946, |
|
"learning_rate": 8.25554221026249e-05, |
|
"loss": 0.0004, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 8.647773279352228, |
|
"grad_norm": 0.0034755307715386152, |
|
"learning_rate": 8.251231437749036e-05, |
|
"loss": 0.0003, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 8.65587044534413, |
|
"grad_norm": 0.008503805845975876, |
|
"learning_rate": 8.246916474123586e-05, |
|
"loss": 0.0003, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 8.663967611336032, |
|
"grad_norm": 0.0027896000538021326, |
|
"learning_rate": 8.242597324948496e-05, |
|
"loss": 0.0003, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 8.672064777327936, |
|
"grad_norm": 0.003082460490986705, |
|
"learning_rate": 8.23827399579151e-05, |
|
"loss": 0.0002, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 8.680161943319838, |
|
"grad_norm": 0.004392626229673624, |
|
"learning_rate": 8.233946492225769e-05, |
|
"loss": 0.0004, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 8.68825910931174, |
|
"grad_norm": 0.0028719629626721144, |
|
"learning_rate": 8.229614819829787e-05, |
|
"loss": 0.0002, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 8.696356275303645, |
|
"grad_norm": 0.00670055765658617, |
|
"learning_rate": 8.225278984187459e-05, |
|
"loss": 0.0004, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 8.704453441295547, |
|
"grad_norm": 0.0042137037962675095, |
|
"learning_rate": 8.220938990888041e-05, |
|
"loss": 0.0003, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 8.704453441295547, |
|
"eval_loss": 0.0013522603549063206, |
|
"eval_runtime": 20.8761, |
|
"eval_samples_per_second": 4.79, |
|
"eval_steps_per_second": 1.198, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 8.712550607287449, |
|
"grad_norm": 0.004317829851061106, |
|
"learning_rate": 8.216594845526154e-05, |
|
"loss": 0.0002, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 8.720647773279353, |
|
"grad_norm": 0.0006851058569736779, |
|
"learning_rate": 8.212246553701764e-05, |
|
"loss": 0.0002, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 8.728744939271255, |
|
"grad_norm": 0.003451045835390687, |
|
"learning_rate": 8.207894121020188e-05, |
|
"loss": 0.0002, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 8.736842105263158, |
|
"grad_norm": 0.004197864327579737, |
|
"learning_rate": 8.203537553092081e-05, |
|
"loss": 0.0003, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 8.744939271255062, |
|
"grad_norm": 0.003602301701903343, |
|
"learning_rate": 8.199176855533426e-05, |
|
"loss": 0.0002, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 8.753036437246964, |
|
"grad_norm": 0.004492857493460178, |
|
"learning_rate": 8.194812033965532e-05, |
|
"loss": 0.0004, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 8.761133603238866, |
|
"grad_norm": 0.006654155440628529, |
|
"learning_rate": 8.190443094015022e-05, |
|
"loss": 0.0004, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 8.76923076923077, |
|
"grad_norm": 0.0013280883431434631, |
|
"learning_rate": 8.186070041313827e-05, |
|
"loss": 0.0002, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 8.777327935222672, |
|
"grad_norm": 0.003171891439706087, |
|
"learning_rate": 8.181692881499183e-05, |
|
"loss": 0.0002, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 8.785425101214575, |
|
"grad_norm": 0.005619837902486324, |
|
"learning_rate": 8.177311620213617e-05, |
|
"loss": 0.0003, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 8.793522267206479, |
|
"grad_norm": 0.0074705081060528755, |
|
"learning_rate": 8.172926263104949e-05, |
|
"loss": 0.0003, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 8.80161943319838, |
|
"grad_norm": 0.003815494477748871, |
|
"learning_rate": 8.168536815826271e-05, |
|
"loss": 0.0002, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 8.809716599190283, |
|
"grad_norm": 0.004843059927225113, |
|
"learning_rate": 8.164143284035953e-05, |
|
"loss": 0.0003, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 8.817813765182187, |
|
"grad_norm": 0.006158561911433935, |
|
"learning_rate": 8.159745673397628e-05, |
|
"loss": 0.0003, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 8.82591093117409, |
|
"grad_norm": 0.003568399930372834, |
|
"learning_rate": 8.155343989580187e-05, |
|
"loss": 0.0004, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 8.834008097165992, |
|
"grad_norm": 0.012608661316335201, |
|
"learning_rate": 8.150938238257773e-05, |
|
"loss": 0.0003, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 8.842105263157894, |
|
"grad_norm": 0.00646247249096632, |
|
"learning_rate": 8.146528425109772e-05, |
|
"loss": 0.0004, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 8.850202429149798, |
|
"grad_norm": 0.004845927469432354, |
|
"learning_rate": 8.142114555820807e-05, |
|
"loss": 0.0003, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 8.8582995951417, |
|
"grad_norm": 0.00976946298032999, |
|
"learning_rate": 8.137696636080725e-05, |
|
"loss": 0.0006, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 8.866396761133604, |
|
"grad_norm": 0.003078792942687869, |
|
"learning_rate": 8.1332746715846e-05, |
|
"loss": 0.0003, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 8.874493927125506, |
|
"grad_norm": 0.0021008781623095274, |
|
"learning_rate": 8.12884866803272e-05, |
|
"loss": 0.0003, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 8.882591093117409, |
|
"grad_norm": 0.010477488860487938, |
|
"learning_rate": 8.124418631130572e-05, |
|
"loss": 0.0003, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 8.89068825910931, |
|
"grad_norm": 0.004115893505513668, |
|
"learning_rate": 8.119984566588852e-05, |
|
"loss": 0.0003, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 8.898785425101215, |
|
"grad_norm": 0.007485564332455397, |
|
"learning_rate": 8.115546480123443e-05, |
|
"loss": 0.0003, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 8.906882591093117, |
|
"grad_norm": 0.006476998329162598, |
|
"learning_rate": 8.111104377455412e-05, |
|
"loss": 0.0003, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.906882591093117, |
|
"eval_loss": 0.001462434884160757, |
|
"eval_runtime": 20.9128, |
|
"eval_samples_per_second": 4.782, |
|
"eval_steps_per_second": 1.195, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.914979757085021, |
|
"grad_norm": 0.003973628859966993, |
|
"learning_rate": 8.106658264311007e-05, |
|
"loss": 0.0002, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 8.923076923076923, |
|
"grad_norm": 0.00737798260524869, |
|
"learning_rate": 8.102208146421642e-05, |
|
"loss": 0.0003, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 8.931174089068826, |
|
"grad_norm": 0.005106086377054453, |
|
"learning_rate": 8.097754029523892e-05, |
|
"loss": 0.0003, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 8.939271255060728, |
|
"grad_norm": 0.0030819557141512632, |
|
"learning_rate": 8.093295919359496e-05, |
|
"loss": 0.0003, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 8.947368421052632, |
|
"grad_norm": 0.006060482934117317, |
|
"learning_rate": 8.08883382167533e-05, |
|
"loss": 0.0003, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 8.955465587044534, |
|
"grad_norm": 0.0065495348535478115, |
|
"learning_rate": 8.084367742223418e-05, |
|
"loss": 0.0003, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 8.963562753036438, |
|
"grad_norm": 0.004741148557513952, |
|
"learning_rate": 8.079897686760911e-05, |
|
"loss": 0.0004, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 8.97165991902834, |
|
"grad_norm": 0.005379111971706152, |
|
"learning_rate": 8.07542366105009e-05, |
|
"loss": 0.0004, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 8.979757085020243, |
|
"grad_norm": 0.0065238396637141705, |
|
"learning_rate": 8.070945670858352e-05, |
|
"loss": 0.0002, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 8.987854251012145, |
|
"grad_norm": 0.003318206174299121, |
|
"learning_rate": 8.066463721958204e-05, |
|
"loss": 0.0003, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 8.995951417004049, |
|
"grad_norm": 0.006058558821678162, |
|
"learning_rate": 8.061977820127256e-05, |
|
"loss": 0.0004, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 9.004048582995951, |
|
"grad_norm": 0.008386258967220783, |
|
"learning_rate": 8.057487971148216e-05, |
|
"loss": 0.0004, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 9.012145748987853, |
|
"grad_norm": 0.0031854738481342793, |
|
"learning_rate": 8.052994180808877e-05, |
|
"loss": 0.0002, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 9.020242914979757, |
|
"grad_norm": 0.004263192415237427, |
|
"learning_rate": 8.048496454902116e-05, |
|
"loss": 0.0003, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 9.02834008097166, |
|
"grad_norm": 0.005687515716999769, |
|
"learning_rate": 8.043994799225882e-05, |
|
"loss": 0.0003, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 9.036437246963562, |
|
"grad_norm": 0.00459116417914629, |
|
"learning_rate": 8.039489219583187e-05, |
|
"loss": 0.0002, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 9.044534412955466, |
|
"grad_norm": 0.002290780423209071, |
|
"learning_rate": 8.034979721782108e-05, |
|
"loss": 0.0003, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 9.052631578947368, |
|
"grad_norm": 0.0025161972735077143, |
|
"learning_rate": 8.030466311635762e-05, |
|
"loss": 0.0002, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 9.06072874493927, |
|
"grad_norm": 0.005804365035146475, |
|
"learning_rate": 8.025948994962322e-05, |
|
"loss": 0.0004, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 9.068825910931174, |
|
"grad_norm": 0.003750093514099717, |
|
"learning_rate": 8.02142777758499e-05, |
|
"loss": 0.0003, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 9.076923076923077, |
|
"grad_norm": 0.0025203884579241276, |
|
"learning_rate": 8.016902665331994e-05, |
|
"loss": 0.0002, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 9.085020242914979, |
|
"grad_norm": 0.008073766715824604, |
|
"learning_rate": 8.01237366403659e-05, |
|
"loss": 0.0005, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 9.093117408906883, |
|
"grad_norm": 0.005724397487938404, |
|
"learning_rate": 8.007840779537039e-05, |
|
"loss": 0.0003, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 9.101214574898785, |
|
"grad_norm": 0.005232294090092182, |
|
"learning_rate": 8.003304017676615e-05, |
|
"loss": 0.0003, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 9.109311740890687, |
|
"grad_norm": 0.006211146246641874, |
|
"learning_rate": 7.998763384303587e-05, |
|
"loss": 0.0003, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 9.109311740890687, |
|
"eval_loss": 0.001377474400214851, |
|
"eval_runtime": 20.882, |
|
"eval_samples_per_second": 4.789, |
|
"eval_steps_per_second": 1.197, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 9.117408906882591, |
|
"grad_norm": 0.003781067207455635, |
|
"learning_rate": 7.994218885271214e-05, |
|
"loss": 0.0003, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 9.125506072874494, |
|
"grad_norm": 0.0043622152879834175, |
|
"learning_rate": 7.98967052643774e-05, |
|
"loss": 0.0002, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 9.133603238866396, |
|
"grad_norm": 0.0023519883397966623, |
|
"learning_rate": 7.985118313666384e-05, |
|
"loss": 0.0003, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 9.1417004048583, |
|
"grad_norm": 0.008506176061928272, |
|
"learning_rate": 7.980562252825332e-05, |
|
"loss": 0.0002, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 9.149797570850202, |
|
"grad_norm": 0.0056921737268567085, |
|
"learning_rate": 7.976002349787732e-05, |
|
"loss": 0.0003, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 9.157894736842104, |
|
"grad_norm": 0.0015705007826909423, |
|
"learning_rate": 7.971438610431684e-05, |
|
"loss": 0.0002, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 9.165991902834008, |
|
"grad_norm": 0.0007020162884145975, |
|
"learning_rate": 7.966871040640233e-05, |
|
"loss": 0.0002, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 9.17408906882591, |
|
"grad_norm": 0.004871773067861795, |
|
"learning_rate": 7.962299646301363e-05, |
|
"loss": 0.0002, |
|
"step": 1133 |
|
}, |
|
{ |
|
"epoch": 9.182186234817813, |
|
"grad_norm": 0.004164962098002434, |
|
"learning_rate": 7.957724433307989e-05, |
|
"loss": 0.0003, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 9.190283400809717, |
|
"grad_norm": 0.005270279943943024, |
|
"learning_rate": 7.953145407557943e-05, |
|
"loss": 0.0003, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 9.19838056680162, |
|
"grad_norm": 0.003013407811522484, |
|
"learning_rate": 7.948562574953982e-05, |
|
"loss": 0.0003, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 9.206477732793521, |
|
"grad_norm": 0.005532771814614534, |
|
"learning_rate": 7.943975941403758e-05, |
|
"loss": 0.0003, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 9.214574898785425, |
|
"grad_norm": 0.003702058456838131, |
|
"learning_rate": 7.939385512819833e-05, |
|
"loss": 0.0005, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 9.222672064777328, |
|
"grad_norm": 0.007236842531710863, |
|
"learning_rate": 7.934791295119657e-05, |
|
"loss": 0.0003, |
|
"step": 1139 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 0.0032832149881869555, |
|
"learning_rate": 7.930193294225563e-05, |
|
"loss": 0.0003, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 9.238866396761134, |
|
"grad_norm": 0.009601681493222713, |
|
"learning_rate": 7.925591516064763e-05, |
|
"loss": 0.0004, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 9.246963562753036, |
|
"grad_norm": 0.0053932759910821915, |
|
"learning_rate": 7.920985966569342e-05, |
|
"loss": 0.0005, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 9.255060728744938, |
|
"grad_norm": 0.0016843380872160196, |
|
"learning_rate": 7.916376651676234e-05, |
|
"loss": 0.0002, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 9.263157894736842, |
|
"grad_norm": 0.0068874419666826725, |
|
"learning_rate": 7.911763577327243e-05, |
|
"loss": 0.0003, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 9.271255060728745, |
|
"grad_norm": 0.003599689109250903, |
|
"learning_rate": 7.907146749469007e-05, |
|
"loss": 0.0003, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 9.279352226720647, |
|
"grad_norm": 0.003752979449927807, |
|
"learning_rate": 7.902526174053011e-05, |
|
"loss": 0.0003, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 9.287449392712551, |
|
"grad_norm": 0.0033392952755093575, |
|
"learning_rate": 7.897901857035564e-05, |
|
"loss": 0.0003, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 9.295546558704453, |
|
"grad_norm": 0.001785742468200624, |
|
"learning_rate": 7.893273804377803e-05, |
|
"loss": 0.0002, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 9.303643724696355, |
|
"grad_norm": 0.0025425672065466642, |
|
"learning_rate": 7.888642022045677e-05, |
|
"loss": 0.0002, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 9.31174089068826, |
|
"grad_norm": 0.004378551617264748, |
|
"learning_rate": 7.884006516009947e-05, |
|
"loss": 0.0004, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 9.31174089068826, |
|
"eval_loss": 0.0015521374298259616, |
|
"eval_runtime": 20.8671, |
|
"eval_samples_per_second": 4.792, |
|
"eval_steps_per_second": 1.198, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 9.319838056680162, |
|
"grad_norm": 0.004232253413647413, |
|
"learning_rate": 7.879367292246169e-05, |
|
"loss": 0.0002, |
|
"step": 1151 |
|
}, |
|
{ |
|
"epoch": 9.327935222672064, |
|
"grad_norm": 0.00398236233741045, |
|
"learning_rate": 7.874724356734698e-05, |
|
"loss": 0.0002, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 9.336032388663968, |
|
"grad_norm": 0.006320311687886715, |
|
"learning_rate": 7.870077715460666e-05, |
|
"loss": 0.0005, |
|
"step": 1153 |
|
}, |
|
{ |
|
"epoch": 9.34412955465587, |
|
"grad_norm": 0.005346038844436407, |
|
"learning_rate": 7.865427374413991e-05, |
|
"loss": 0.0003, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 9.352226720647772, |
|
"grad_norm": 0.00476599158719182, |
|
"learning_rate": 7.860773339589351e-05, |
|
"loss": 0.0003, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 9.360323886639677, |
|
"grad_norm": 0.0034629430156201124, |
|
"learning_rate": 7.856115616986194e-05, |
|
"loss": 0.0003, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 9.368421052631579, |
|
"grad_norm": 0.0017381705110892653, |
|
"learning_rate": 7.851454212608715e-05, |
|
"loss": 0.0002, |
|
"step": 1157 |
|
}, |
|
{ |
|
"epoch": 9.376518218623481, |
|
"grad_norm": 0.0038830083794891834, |
|
"learning_rate": 7.846789132465858e-05, |
|
"loss": 0.0002, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 9.384615384615385, |
|
"grad_norm": 0.0028537509497255087, |
|
"learning_rate": 7.842120382571308e-05, |
|
"loss": 0.0003, |
|
"step": 1159 |
|
}, |
|
{ |
|
"epoch": 9.392712550607287, |
|
"grad_norm": 0.0017488327575847507, |
|
"learning_rate": 7.837447968943474e-05, |
|
"loss": 0.0002, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 9.40080971659919, |
|
"grad_norm": 0.0021484007593244314, |
|
"learning_rate": 7.832771897605496e-05, |
|
"loss": 0.0002, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 9.408906882591094, |
|
"grad_norm": 0.003066607750952244, |
|
"learning_rate": 7.828092174585221e-05, |
|
"loss": 0.0002, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 9.417004048582996, |
|
"grad_norm": 0.001146377413533628, |
|
"learning_rate": 7.823408805915212e-05, |
|
"loss": 0.0002, |
|
"step": 1163 |
|
}, |
|
{ |
|
"epoch": 9.425101214574898, |
|
"grad_norm": 0.004038800951093435, |
|
"learning_rate": 7.818721797632724e-05, |
|
"loss": 0.0002, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 9.433198380566802, |
|
"grad_norm": 0.004361163824796677, |
|
"learning_rate": 7.814031155779708e-05, |
|
"loss": 0.0003, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 9.441295546558704, |
|
"grad_norm": 0.004912042990326881, |
|
"learning_rate": 7.809336886402796e-05, |
|
"loss": 0.0003, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 9.449392712550607, |
|
"grad_norm": 0.0021194296423345804, |
|
"learning_rate": 7.804638995553297e-05, |
|
"loss": 0.0003, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 9.45748987854251, |
|
"grad_norm": 0.004582180175930262, |
|
"learning_rate": 7.799937489287192e-05, |
|
"loss": 0.0003, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 9.465587044534413, |
|
"grad_norm": 0.0019999807700514793, |
|
"learning_rate": 7.79523237366512e-05, |
|
"loss": 0.0002, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 9.473684210526315, |
|
"grad_norm": 0.0027365379501134157, |
|
"learning_rate": 7.79052365475237e-05, |
|
"loss": 0.0003, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 9.481781376518219, |
|
"grad_norm": 0.003132071578875184, |
|
"learning_rate": 7.785811338618878e-05, |
|
"loss": 0.0002, |
|
"step": 1171 |
|
}, |
|
{ |
|
"epoch": 9.489878542510121, |
|
"grad_norm": 0.004992193076759577, |
|
"learning_rate": 7.781095431339221e-05, |
|
"loss": 0.0002, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 9.497975708502024, |
|
"grad_norm": 0.005324013065546751, |
|
"learning_rate": 7.776375938992599e-05, |
|
"loss": 0.0004, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 9.506072874493928, |
|
"grad_norm": 0.0014854903565719724, |
|
"learning_rate": 7.771652867662838e-05, |
|
"loss": 0.0002, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 9.51417004048583, |
|
"grad_norm": 0.0030039751436561346, |
|
"learning_rate": 7.766926223438375e-05, |
|
"loss": 0.0002, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 9.51417004048583, |
|
"eval_loss": 0.001370253972709179, |
|
"eval_runtime": 20.9183, |
|
"eval_samples_per_second": 4.781, |
|
"eval_steps_per_second": 1.195, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 9.522267206477732, |
|
"grad_norm": 0.0022788650821894407, |
|
"learning_rate": 7.762196012412255e-05, |
|
"loss": 0.0002, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 9.530364372469636, |
|
"grad_norm": 0.006257211789488792, |
|
"learning_rate": 7.757462240682119e-05, |
|
"loss": 0.0003, |
|
"step": 1177 |
|
}, |
|
{ |
|
"epoch": 9.538461538461538, |
|
"grad_norm": 0.0036694249138236046, |
|
"learning_rate": 7.752724914350196e-05, |
|
"loss": 0.0003, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 9.54655870445344, |
|
"grad_norm": 0.0010751505615189672, |
|
"learning_rate": 7.747984039523304e-05, |
|
"loss": 0.0002, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 9.554655870445345, |
|
"grad_norm": 0.002307455288246274, |
|
"learning_rate": 7.74323962231283e-05, |
|
"loss": 0.0002, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 9.562753036437247, |
|
"grad_norm": 0.0009123678901232779, |
|
"learning_rate": 7.738491668834726e-05, |
|
"loss": 0.0001, |
|
"step": 1181 |
|
}, |
|
{ |
|
"epoch": 9.570850202429149, |
|
"grad_norm": 0.009321003220975399, |
|
"learning_rate": 7.733740185209508e-05, |
|
"loss": 0.0003, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 9.578947368421053, |
|
"grad_norm": 0.0018089297227561474, |
|
"learning_rate": 7.728985177562239e-05, |
|
"loss": 0.0002, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 9.587044534412955, |
|
"grad_norm": 0.004776486661285162, |
|
"learning_rate": 7.724226652022526e-05, |
|
"loss": 0.0003, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 9.595141700404858, |
|
"grad_norm": 0.0017958278767764568, |
|
"learning_rate": 7.71946461472451e-05, |
|
"loss": 0.0002, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 9.603238866396762, |
|
"grad_norm": 0.0028115343302488327, |
|
"learning_rate": 7.714699071806859e-05, |
|
"loss": 0.0002, |
|
"step": 1186 |
|
}, |
|
{ |
|
"epoch": 9.611336032388664, |
|
"grad_norm": 0.0017144676530733705, |
|
"learning_rate": 7.709930029412762e-05, |
|
"loss": 0.0002, |
|
"step": 1187 |
|
}, |
|
{ |
|
"epoch": 9.619433198380566, |
|
"grad_norm": 0.0012224303791299462, |
|
"learning_rate": 7.705157493689915e-05, |
|
"loss": 0.0002, |
|
"step": 1188 |
|
}, |
|
{ |
|
"epoch": 9.62753036437247, |
|
"grad_norm": 0.0017139979172497988, |
|
"learning_rate": 7.70038147079052e-05, |
|
"loss": 0.0002, |
|
"step": 1189 |
|
}, |
|
{ |
|
"epoch": 9.635627530364372, |
|
"grad_norm": 0.00275692087598145, |
|
"learning_rate": 7.695601966871277e-05, |
|
"loss": 0.0002, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 9.643724696356275, |
|
"grad_norm": 0.005676504224538803, |
|
"learning_rate": 7.690818988093367e-05, |
|
"loss": 0.0002, |
|
"step": 1191 |
|
}, |
|
{ |
|
"epoch": 9.651821862348179, |
|
"grad_norm": 0.012121010571718216, |
|
"learning_rate": 7.686032540622457e-05, |
|
"loss": 0.0002, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 9.65991902834008, |
|
"grad_norm": 0.005054876208305359, |
|
"learning_rate": 7.68124263062868e-05, |
|
"loss": 0.0003, |
|
"step": 1193 |
|
}, |
|
{ |
|
"epoch": 9.668016194331983, |
|
"grad_norm": 0.002249806420877576, |
|
"learning_rate": 7.676449264286633e-05, |
|
"loss": 0.0002, |
|
"step": 1194 |
|
}, |
|
{ |
|
"epoch": 9.676113360323887, |
|
"grad_norm": 0.003644303185865283, |
|
"learning_rate": 7.671652447775374e-05, |
|
"loss": 0.0003, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 9.68421052631579, |
|
"grad_norm": 0.001162796514108777, |
|
"learning_rate": 7.666852187278402e-05, |
|
"loss": 0.0001, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 9.692307692307692, |
|
"grad_norm": 0.007950011640787125, |
|
"learning_rate": 7.662048488983658e-05, |
|
"loss": 0.0003, |
|
"step": 1197 |
|
}, |
|
{ |
|
"epoch": 9.700404858299596, |
|
"grad_norm": 0.005151614546775818, |
|
"learning_rate": 7.657241359083518e-05, |
|
"loss": 0.0002, |
|
"step": 1198 |
|
}, |
|
{ |
|
"epoch": 9.708502024291498, |
|
"grad_norm": 0.0012473291717469692, |
|
"learning_rate": 7.652430803774778e-05, |
|
"loss": 0.0002, |
|
"step": 1199 |
|
}, |
|
{ |
|
"epoch": 9.7165991902834, |
|
"grad_norm": 0.006338824518024921, |
|
"learning_rate": 7.647616829258645e-05, |
|
"loss": 0.0005, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.7165991902834, |
|
"eval_loss": 0.001362333190627396, |
|
"eval_runtime": 20.9031, |
|
"eval_samples_per_second": 4.784, |
|
"eval_steps_per_second": 1.196, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.724696356275304, |
|
"grad_norm": 0.0037102538626641035, |
|
"learning_rate": 7.642799441740745e-05, |
|
"loss": 0.0002, |
|
"step": 1201 |
|
}, |
|
{ |
|
"epoch": 9.732793522267206, |
|
"grad_norm": 0.005634430330246687, |
|
"learning_rate": 7.637978647431094e-05, |
|
"loss": 0.0004, |
|
"step": 1202 |
|
}, |
|
{ |
|
"epoch": 9.740890688259109, |
|
"grad_norm": 0.0025536268949508667, |
|
"learning_rate": 7.633154452544105e-05, |
|
"loss": 0.0002, |
|
"step": 1203 |
|
}, |
|
{ |
|
"epoch": 9.748987854251013, |
|
"grad_norm": 0.003189551178365946, |
|
"learning_rate": 7.628326863298573e-05, |
|
"loss": 0.0003, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 9.757085020242915, |
|
"grad_norm": 0.003228923538699746, |
|
"learning_rate": 7.623495885917666e-05, |
|
"loss": 0.0003, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 9.765182186234817, |
|
"grad_norm": 0.004319958388805389, |
|
"learning_rate": 7.618661526628926e-05, |
|
"loss": 0.0002, |
|
"step": 1206 |
|
}, |
|
{ |
|
"epoch": 9.773279352226721, |
|
"grad_norm": 0.0017490462632849813, |
|
"learning_rate": 7.613823791664244e-05, |
|
"loss": 0.0002, |
|
"step": 1207 |
|
}, |
|
{ |
|
"epoch": 9.781376518218623, |
|
"grad_norm": 0.0028828205540776253, |
|
"learning_rate": 7.608982687259876e-05, |
|
"loss": 0.0002, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 9.789473684210526, |
|
"grad_norm": 0.0010670493356883526, |
|
"learning_rate": 7.604138219656411e-05, |
|
"loss": 0.0002, |
|
"step": 1209 |
|
}, |
|
{ |
|
"epoch": 9.79757085020243, |
|
"grad_norm": 0.0030811934266239405, |
|
"learning_rate": 7.599290395098777e-05, |
|
"loss": 0.0004, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 9.805668016194332, |
|
"grad_norm": 0.0027857047971338034, |
|
"learning_rate": 7.594439219836229e-05, |
|
"loss": 0.0002, |
|
"step": 1211 |
|
}, |
|
{ |
|
"epoch": 9.813765182186234, |
|
"grad_norm": 0.006909377872943878, |
|
"learning_rate": 7.589584700122345e-05, |
|
"loss": 0.0004, |
|
"step": 1212 |
|
}, |
|
{ |
|
"epoch": 9.821862348178138, |
|
"grad_norm": 0.003193581011146307, |
|
"learning_rate": 7.584726842215009e-05, |
|
"loss": 0.0002, |
|
"step": 1213 |
|
}, |
|
{ |
|
"epoch": 9.82995951417004, |
|
"grad_norm": 0.0016280598938465118, |
|
"learning_rate": 7.579865652376407e-05, |
|
"loss": 0.0002, |
|
"step": 1214 |
|
}, |
|
{ |
|
"epoch": 9.838056680161943, |
|
"grad_norm": 0.002516286913305521, |
|
"learning_rate": 7.57500113687303e-05, |
|
"loss": 0.0003, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 9.846153846153847, |
|
"grad_norm": 0.0036160030867904425, |
|
"learning_rate": 7.570133301975645e-05, |
|
"loss": 0.0003, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 9.854251012145749, |
|
"grad_norm": 0.006766727659851313, |
|
"learning_rate": 7.565262153959301e-05, |
|
"loss": 0.0002, |
|
"step": 1217 |
|
}, |
|
{ |
|
"epoch": 9.862348178137651, |
|
"grad_norm": 0.0024577074218541384, |
|
"learning_rate": 7.560387699103323e-05, |
|
"loss": 0.0002, |
|
"step": 1218 |
|
}, |
|
{ |
|
"epoch": 9.870445344129555, |
|
"grad_norm": 0.0015718834474682808, |
|
"learning_rate": 7.555509943691296e-05, |
|
"loss": 0.0002, |
|
"step": 1219 |
|
}, |
|
{ |
|
"epoch": 9.878542510121457, |
|
"grad_norm": 0.0027964310720562935, |
|
"learning_rate": 7.550628894011053e-05, |
|
"loss": 0.0002, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 9.88663967611336, |
|
"grad_norm": 0.007514572236686945, |
|
"learning_rate": 7.545744556354685e-05, |
|
"loss": 0.0004, |
|
"step": 1221 |
|
}, |
|
{ |
|
"epoch": 9.894736842105264, |
|
"grad_norm": 0.009225533343851566, |
|
"learning_rate": 7.540856937018515e-05, |
|
"loss": 0.0003, |
|
"step": 1222 |
|
}, |
|
{ |
|
"epoch": 9.902834008097166, |
|
"grad_norm": 0.004441691096872091, |
|
"learning_rate": 7.535966042303094e-05, |
|
"loss": 0.0003, |
|
"step": 1223 |
|
}, |
|
{ |
|
"epoch": 9.910931174089068, |
|
"grad_norm": 0.0031521234195679426, |
|
"learning_rate": 7.531071878513202e-05, |
|
"loss": 0.0003, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 9.919028340080972, |
|
"grad_norm": 0.0024993985425680876, |
|
"learning_rate": 7.526174451957827e-05, |
|
"loss": 0.0002, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 9.919028340080972, |
|
"eval_loss": 0.0013724053278565407, |
|
"eval_runtime": 20.8813, |
|
"eval_samples_per_second": 4.789, |
|
"eval_steps_per_second": 1.197, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 9.927125506072874, |
|
"grad_norm": 0.0053532798774540424, |
|
"learning_rate": 7.521273768950167e-05, |
|
"loss": 0.0004, |
|
"step": 1226 |
|
}, |
|
{ |
|
"epoch": 9.935222672064777, |
|
"grad_norm": 0.0038510486483573914, |
|
"learning_rate": 7.516369835807615e-05, |
|
"loss": 0.0002, |
|
"step": 1227 |
|
}, |
|
{ |
|
"epoch": 9.94331983805668, |
|
"grad_norm": 0.003887588856741786, |
|
"learning_rate": 7.511462658851759e-05, |
|
"loss": 0.0002, |
|
"step": 1228 |
|
}, |
|
{ |
|
"epoch": 9.951417004048583, |
|
"grad_norm": 0.0037457100115716457, |
|
"learning_rate": 7.50655224440836e-05, |
|
"loss": 0.0004, |
|
"step": 1229 |
|
}, |
|
{ |
|
"epoch": 9.959514170040485, |
|
"grad_norm": 0.006001995410770178, |
|
"learning_rate": 7.501638598807359e-05, |
|
"loss": 0.0002, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 9.96761133603239, |
|
"grad_norm": 0.0024609274696558714, |
|
"learning_rate": 7.496721728382861e-05, |
|
"loss": 0.0002, |
|
"step": 1231 |
|
}, |
|
{ |
|
"epoch": 9.975708502024291, |
|
"grad_norm": 0.008431760594248772, |
|
"learning_rate": 7.491801639473127e-05, |
|
"loss": 0.0004, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 9.983805668016194, |
|
"grad_norm": 0.003166408510878682, |
|
"learning_rate": 7.486878338420567e-05, |
|
"loss": 0.0003, |
|
"step": 1233 |
|
}, |
|
{ |
|
"epoch": 9.991902834008098, |
|
"grad_norm": 0.005395290441811085, |
|
"learning_rate": 7.48195183157173e-05, |
|
"loss": 0.0003, |
|
"step": 1234 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.0012857104884460568, |
|
"learning_rate": 7.477022125277304e-05, |
|
"loss": 0.0003, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 10.008097165991902, |
|
"grad_norm": 0.0011820251820608974, |
|
"learning_rate": 7.472089225892093e-05, |
|
"loss": 0.0002, |
|
"step": 1236 |
|
}, |
|
{ |
|
"epoch": 10.016194331983806, |
|
"grad_norm": 0.009494257159531116, |
|
"learning_rate": 7.467153139775022e-05, |
|
"loss": 0.0002, |
|
"step": 1237 |
|
}, |
|
{ |
|
"epoch": 10.024291497975709, |
|
"grad_norm": 0.000511787598952651, |
|
"learning_rate": 7.462213873289123e-05, |
|
"loss": 0.0002, |
|
"step": 1238 |
|
}, |
|
{ |
|
"epoch": 10.03238866396761, |
|
"grad_norm": 0.003136824816465378, |
|
"learning_rate": 7.457271432801531e-05, |
|
"loss": 0.0002, |
|
"step": 1239 |
|
}, |
|
{ |
|
"epoch": 10.040485829959515, |
|
"grad_norm": 0.0016326183686032891, |
|
"learning_rate": 7.452325824683463e-05, |
|
"loss": 0.0002, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 10.048582995951417, |
|
"grad_norm": 0.001595051260665059, |
|
"learning_rate": 7.447377055310231e-05, |
|
"loss": 0.0002, |
|
"step": 1241 |
|
}, |
|
{ |
|
"epoch": 10.05668016194332, |
|
"grad_norm": 0.0034364748280495405, |
|
"learning_rate": 7.442425131061215e-05, |
|
"loss": 0.0002, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 10.064777327935223, |
|
"grad_norm": 0.0006874504615552723, |
|
"learning_rate": 7.437470058319865e-05, |
|
"loss": 0.0002, |
|
"step": 1243 |
|
}, |
|
{ |
|
"epoch": 10.072874493927126, |
|
"grad_norm": 0.0012387243332341313, |
|
"learning_rate": 7.432511843473683e-05, |
|
"loss": 0.0002, |
|
"step": 1244 |
|
}, |
|
{ |
|
"epoch": 10.080971659919028, |
|
"grad_norm": 0.005311995279043913, |
|
"learning_rate": 7.427550492914235e-05, |
|
"loss": 0.0004, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 10.089068825910932, |
|
"grad_norm": 0.007275078445672989, |
|
"learning_rate": 7.422586013037114e-05, |
|
"loss": 0.0002, |
|
"step": 1246 |
|
}, |
|
{ |
|
"epoch": 10.097165991902834, |
|
"grad_norm": 0.0030083160381764174, |
|
"learning_rate": 7.417618410241959e-05, |
|
"loss": 0.0002, |
|
"step": 1247 |
|
}, |
|
{ |
|
"epoch": 10.105263157894736, |
|
"grad_norm": 0.004656744189560413, |
|
"learning_rate": 7.412647690932426e-05, |
|
"loss": 0.0003, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 10.11336032388664, |
|
"grad_norm": 0.0035645875614136457, |
|
"learning_rate": 7.407673861516195e-05, |
|
"loss": 0.0002, |
|
"step": 1249 |
|
}, |
|
{ |
|
"epoch": 10.121457489878543, |
|
"grad_norm": 0.0017369840061292052, |
|
"learning_rate": 7.402696928404951e-05, |
|
"loss": 0.0002, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 10.121457489878543, |
|
"eval_loss": 0.0013646668521687388, |
|
"eval_runtime": 20.8332, |
|
"eval_samples_per_second": 4.8, |
|
"eval_steps_per_second": 1.2, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 10.129554655870445, |
|
"grad_norm": 0.0007923658704385161, |
|
"learning_rate": 7.39771689801438e-05, |
|
"loss": 0.0002, |
|
"step": 1251 |
|
}, |
|
{ |
|
"epoch": 10.137651821862349, |
|
"grad_norm": 0.005313723348081112, |
|
"learning_rate": 7.392733776764164e-05, |
|
"loss": 0.0002, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 10.145748987854251, |
|
"grad_norm": 0.002177554415538907, |
|
"learning_rate": 7.387747571077966e-05, |
|
"loss": 0.0002, |
|
"step": 1253 |
|
}, |
|
{ |
|
"epoch": 10.153846153846153, |
|
"grad_norm": 0.0029326973017305136, |
|
"learning_rate": 7.382758287383426e-05, |
|
"loss": 0.0002, |
|
"step": 1254 |
|
}, |
|
{ |
|
"epoch": 10.161943319838057, |
|
"grad_norm": 0.001368851400911808, |
|
"learning_rate": 7.377765932112157e-05, |
|
"loss": 0.0002, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 10.17004048582996, |
|
"grad_norm": 0.003802344435825944, |
|
"learning_rate": 7.372770511699719e-05, |
|
"loss": 0.0003, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 10.178137651821862, |
|
"grad_norm": 0.0012702594976872206, |
|
"learning_rate": 7.367772032585634e-05, |
|
"loss": 0.0002, |
|
"step": 1257 |
|
}, |
|
{ |
|
"epoch": 10.186234817813766, |
|
"grad_norm": 0.005360572598874569, |
|
"learning_rate": 7.362770501213367e-05, |
|
"loss": 0.0003, |
|
"step": 1258 |
|
}, |
|
{ |
|
"epoch": 10.194331983805668, |
|
"grad_norm": 0.0019447492668405175, |
|
"learning_rate": 7.357765924030311e-05, |
|
"loss": 0.0002, |
|
"step": 1259 |
|
}, |
|
{ |
|
"epoch": 10.20242914979757, |
|
"grad_norm": 0.003229207592085004, |
|
"learning_rate": 7.352758307487788e-05, |
|
"loss": 0.0003, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 10.210526315789474, |
|
"grad_norm": 0.0026793682482093573, |
|
"learning_rate": 7.347747658041043e-05, |
|
"loss": 0.0002, |
|
"step": 1261 |
|
}, |
|
{ |
|
"epoch": 10.218623481781377, |
|
"grad_norm": 0.0010839339811354876, |
|
"learning_rate": 7.342733982149223e-05, |
|
"loss": 0.0002, |
|
"step": 1262 |
|
}, |
|
{ |
|
"epoch": 10.226720647773279, |
|
"grad_norm": 0.004595485050231218, |
|
"learning_rate": 7.33771728627538e-05, |
|
"loss": 0.0003, |
|
"step": 1263 |
|
}, |
|
{ |
|
"epoch": 10.234817813765183, |
|
"grad_norm": 0.0012352195335552096, |
|
"learning_rate": 7.332697576886462e-05, |
|
"loss": 0.0002, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 10.242914979757085, |
|
"grad_norm": 0.004273899365216494, |
|
"learning_rate": 7.327674860453296e-05, |
|
"loss": 0.0002, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 10.251012145748987, |
|
"grad_norm": 0.005454242695122957, |
|
"learning_rate": 7.322649143450585e-05, |
|
"loss": 0.0003, |
|
"step": 1266 |
|
}, |
|
{ |
|
"epoch": 10.259109311740891, |
|
"grad_norm": 0.0013370011001825333, |
|
"learning_rate": 7.317620432356907e-05, |
|
"loss": 0.0002, |
|
"step": 1267 |
|
}, |
|
{ |
|
"epoch": 10.267206477732794, |
|
"grad_norm": 0.00569200748577714, |
|
"learning_rate": 7.312588733654693e-05, |
|
"loss": 0.0002, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 10.275303643724696, |
|
"grad_norm": 0.0030299886129796505, |
|
"learning_rate": 7.307554053830232e-05, |
|
"loss": 0.0002, |
|
"step": 1269 |
|
}, |
|
{ |
|
"epoch": 10.2834008097166, |
|
"grad_norm": 0.0035660641733556986, |
|
"learning_rate": 7.302516399373645e-05, |
|
"loss": 0.0002, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 10.291497975708502, |
|
"grad_norm": 0.0019571082666516304, |
|
"learning_rate": 7.2974757767789e-05, |
|
"loss": 0.0002, |
|
"step": 1271 |
|
}, |
|
{ |
|
"epoch": 10.299595141700404, |
|
"grad_norm": 0.0020669170189648867, |
|
"learning_rate": 7.292432192543783e-05, |
|
"loss": 0.0002, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 10.307692307692308, |
|
"grad_norm": 0.0015401261625811458, |
|
"learning_rate": 7.287385653169898e-05, |
|
"loss": 0.0002, |
|
"step": 1273 |
|
}, |
|
{ |
|
"epoch": 10.31578947368421, |
|
"grad_norm": 0.0020590811036527157, |
|
"learning_rate": 7.282336165162665e-05, |
|
"loss": 0.0003, |
|
"step": 1274 |
|
}, |
|
{ |
|
"epoch": 10.323886639676113, |
|
"grad_norm": 0.0030925278551876545, |
|
"learning_rate": 7.277283735031298e-05, |
|
"loss": 0.0002, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 10.323886639676113, |
|
"eval_loss": 0.0014393809251487255, |
|
"eval_runtime": 20.8439, |
|
"eval_samples_per_second": 4.798, |
|
"eval_steps_per_second": 1.199, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 10.331983805668017, |
|
"grad_norm": 0.002777695655822754, |
|
"learning_rate": 7.272228369288806e-05, |
|
"loss": 0.0002, |
|
"step": 1276 |
|
}, |
|
{ |
|
"epoch": 10.34008097165992, |
|
"grad_norm": 0.0012481295270845294, |
|
"learning_rate": 7.267170074451983e-05, |
|
"loss": 0.0002, |
|
"step": 1277 |
|
}, |
|
{ |
|
"epoch": 10.348178137651821, |
|
"grad_norm": 0.0015662169316783547, |
|
"learning_rate": 7.262108857041399e-05, |
|
"loss": 0.0002, |
|
"step": 1278 |
|
}, |
|
{ |
|
"epoch": 10.356275303643725, |
|
"grad_norm": 0.003938265610486269, |
|
"learning_rate": 7.257044723581391e-05, |
|
"loss": 0.0002, |
|
"step": 1279 |
|
}, |
|
{ |
|
"epoch": 10.364372469635628, |
|
"grad_norm": 0.0015693637542426586, |
|
"learning_rate": 7.251977680600053e-05, |
|
"loss": 0.0002, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 10.37246963562753, |
|
"grad_norm": 0.002020388375967741, |
|
"learning_rate": 7.246907734629233e-05, |
|
"loss": 0.0002, |
|
"step": 1281 |
|
}, |
|
{ |
|
"epoch": 10.380566801619434, |
|
"grad_norm": 0.00170166976749897, |
|
"learning_rate": 7.24183489220452e-05, |
|
"loss": 0.0002, |
|
"step": 1282 |
|
}, |
|
{ |
|
"epoch": 10.388663967611336, |
|
"grad_norm": 0.002372957533225417, |
|
"learning_rate": 7.236759159865236e-05, |
|
"loss": 0.0002, |
|
"step": 1283 |
|
}, |
|
{ |
|
"epoch": 10.396761133603238, |
|
"grad_norm": 0.004695139825344086, |
|
"learning_rate": 7.231680544154427e-05, |
|
"loss": 0.0002, |
|
"step": 1284 |
|
}, |
|
{ |
|
"epoch": 10.404858299595142, |
|
"grad_norm": 0.002594963414594531, |
|
"learning_rate": 7.226599051618863e-05, |
|
"loss": 0.0002, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 10.412955465587045, |
|
"grad_norm": 0.0015460449503734708, |
|
"learning_rate": 7.22151468880901e-05, |
|
"loss": 0.0002, |
|
"step": 1286 |
|
}, |
|
{ |
|
"epoch": 10.421052631578947, |
|
"grad_norm": 0.005851665511727333, |
|
"learning_rate": 7.216427462279047e-05, |
|
"loss": 0.0002, |
|
"step": 1287 |
|
}, |
|
{ |
|
"epoch": 10.429149797570851, |
|
"grad_norm": 0.0015737306093797088, |
|
"learning_rate": 7.211337378586835e-05, |
|
"loss": 0.0002, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 10.437246963562753, |
|
"grad_norm": 0.003812261624261737, |
|
"learning_rate": 7.206244444293925e-05, |
|
"loss": 0.0002, |
|
"step": 1289 |
|
}, |
|
{ |
|
"epoch": 10.445344129554655, |
|
"grad_norm": 0.0015756061766296625, |
|
"learning_rate": 7.201148665965536e-05, |
|
"loss": 0.0002, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 10.45344129554656, |
|
"grad_norm": 0.005322249606251717, |
|
"learning_rate": 7.196050050170561e-05, |
|
"loss": 0.0002, |
|
"step": 1291 |
|
}, |
|
{ |
|
"epoch": 10.461538461538462, |
|
"grad_norm": 0.0012198768090456724, |
|
"learning_rate": 7.190948603481543e-05, |
|
"loss": 0.0002, |
|
"step": 1292 |
|
}, |
|
{ |
|
"epoch": 10.469635627530364, |
|
"grad_norm": 0.002762486459687352, |
|
"learning_rate": 7.185844332474679e-05, |
|
"loss": 0.0003, |
|
"step": 1293 |
|
}, |
|
{ |
|
"epoch": 10.477732793522268, |
|
"grad_norm": 0.0013679059920832515, |
|
"learning_rate": 7.180737243729804e-05, |
|
"loss": 0.0002, |
|
"step": 1294 |
|
}, |
|
{ |
|
"epoch": 10.48582995951417, |
|
"grad_norm": 0.0009709448786452413, |
|
"learning_rate": 7.175627343830392e-05, |
|
"loss": 0.0002, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 10.493927125506072, |
|
"grad_norm": 0.0033552187960594893, |
|
"learning_rate": 7.17051463936353e-05, |
|
"loss": 0.0003, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 10.502024291497976, |
|
"grad_norm": 0.001450160052627325, |
|
"learning_rate": 7.16539913691993e-05, |
|
"loss": 0.0002, |
|
"step": 1297 |
|
}, |
|
{ |
|
"epoch": 10.510121457489879, |
|
"grad_norm": 0.0011616905685514212, |
|
"learning_rate": 7.160280843093902e-05, |
|
"loss": 0.0002, |
|
"step": 1298 |
|
}, |
|
{ |
|
"epoch": 10.518218623481781, |
|
"grad_norm": 0.0014929898316040635, |
|
"learning_rate": 7.155159764483364e-05, |
|
"loss": 0.0002, |
|
"step": 1299 |
|
}, |
|
{ |
|
"epoch": 10.526315789473685, |
|
"grad_norm": 0.0010820671450346708, |
|
"learning_rate": 7.150035907689816e-05, |
|
"loss": 0.0002, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 10.526315789473685, |
|
"eval_loss": 0.0013635024661198258, |
|
"eval_runtime": 20.8752, |
|
"eval_samples_per_second": 4.79, |
|
"eval_steps_per_second": 1.198, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 10.534412955465587, |
|
"grad_norm": 0.002717435359954834, |
|
"learning_rate": 7.144909279318344e-05, |
|
"loss": 0.0002, |
|
"step": 1301 |
|
}, |
|
{ |
|
"epoch": 10.54251012145749, |
|
"grad_norm": 0.004127759486436844, |
|
"learning_rate": 7.139779885977604e-05, |
|
"loss": 0.0003, |
|
"step": 1302 |
|
}, |
|
{ |
|
"epoch": 10.550607287449393, |
|
"grad_norm": 0.0019464613869786263, |
|
"learning_rate": 7.134647734279817e-05, |
|
"loss": 0.0002, |
|
"step": 1303 |
|
}, |
|
{ |
|
"epoch": 10.558704453441296, |
|
"grad_norm": 0.002114477101713419, |
|
"learning_rate": 7.129512830840763e-05, |
|
"loss": 0.0002, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 10.566801619433198, |
|
"grad_norm": 0.0025543624069541693, |
|
"learning_rate": 7.124375182279762e-05, |
|
"loss": 0.0003, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 10.574898785425102, |
|
"grad_norm": 0.0009003437007777393, |
|
"learning_rate": 7.11923479521968e-05, |
|
"loss": 0.0002, |
|
"step": 1306 |
|
}, |
|
{ |
|
"epoch": 10.582995951417004, |
|
"grad_norm": 0.002608225215226412, |
|
"learning_rate": 7.11409167628691e-05, |
|
"loss": 0.0002, |
|
"step": 1307 |
|
}, |
|
{ |
|
"epoch": 10.591093117408906, |
|
"grad_norm": 0.004385102540254593, |
|
"learning_rate": 7.108945832111366e-05, |
|
"loss": 0.0003, |
|
"step": 1308 |
|
}, |
|
{ |
|
"epoch": 10.59919028340081, |
|
"grad_norm": 0.002555650193244219, |
|
"learning_rate": 7.103797269326475e-05, |
|
"loss": 0.0002, |
|
"step": 1309 |
|
}, |
|
{ |
|
"epoch": 10.607287449392713, |
|
"grad_norm": 0.00154935906175524, |
|
"learning_rate": 7.098645994569171e-05, |
|
"loss": 0.0002, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 10.615384615384615, |
|
"grad_norm": 0.0027549327351152897, |
|
"learning_rate": 7.093492014479884e-05, |
|
"loss": 0.0002, |
|
"step": 1311 |
|
}, |
|
{ |
|
"epoch": 10.623481781376519, |
|
"grad_norm": 0.0012162472121417522, |
|
"learning_rate": 7.088335335702525e-05, |
|
"loss": 0.0002, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 10.631578947368421, |
|
"grad_norm": 0.0018456067191436887, |
|
"learning_rate": 7.083175964884491e-05, |
|
"loss": 0.0002, |
|
"step": 1313 |
|
}, |
|
{ |
|
"epoch": 10.639676113360323, |
|
"grad_norm": 0.0008041391847655177, |
|
"learning_rate": 7.078013908676649e-05, |
|
"loss": 0.0002, |
|
"step": 1314 |
|
}, |
|
{ |
|
"epoch": 10.647773279352228, |
|
"grad_norm": 0.0049199191853404045, |
|
"learning_rate": 7.072849173733323e-05, |
|
"loss": 0.0002, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 10.65587044534413, |
|
"grad_norm": 0.0091862166300416, |
|
"learning_rate": 7.067681766712293e-05, |
|
"loss": 0.0004, |
|
"step": 1316 |
|
}, |
|
{ |
|
"epoch": 10.663967611336032, |
|
"grad_norm": 0.0005279682809486985, |
|
"learning_rate": 7.062511694274783e-05, |
|
"loss": 0.0002, |
|
"step": 1317 |
|
}, |
|
{ |
|
"epoch": 10.672064777327936, |
|
"grad_norm": 0.003007502295076847, |
|
"learning_rate": 7.057338963085453e-05, |
|
"loss": 0.0003, |
|
"step": 1318 |
|
}, |
|
{ |
|
"epoch": 10.680161943319838, |
|
"grad_norm": 0.0021479730494320393, |
|
"learning_rate": 7.052163579812393e-05, |
|
"loss": 0.0002, |
|
"step": 1319 |
|
}, |
|
{ |
|
"epoch": 10.68825910931174, |
|
"grad_norm": 0.0024797257501631975, |
|
"learning_rate": 7.046985551127106e-05, |
|
"loss": 0.0002, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 10.696356275303645, |
|
"grad_norm": 0.0019876586738973856, |
|
"learning_rate": 7.04180488370451e-05, |
|
"loss": 0.0002, |
|
"step": 1321 |
|
}, |
|
{ |
|
"epoch": 10.704453441295547, |
|
"grad_norm": 0.002082040999084711, |
|
"learning_rate": 7.036621584222925e-05, |
|
"loss": 0.0002, |
|
"step": 1322 |
|
}, |
|
{ |
|
"epoch": 10.712550607287449, |
|
"grad_norm": 0.0060581970028579235, |
|
"learning_rate": 7.031435659364057e-05, |
|
"loss": 0.0004, |
|
"step": 1323 |
|
}, |
|
{ |
|
"epoch": 10.720647773279353, |
|
"grad_norm": 0.0017555778613314033, |
|
"learning_rate": 7.026247115813003e-05, |
|
"loss": 0.0002, |
|
"step": 1324 |
|
}, |
|
{ |
|
"epoch": 10.728744939271255, |
|
"grad_norm": 0.003592435270547867, |
|
"learning_rate": 7.021055960258239e-05, |
|
"loss": 0.0002, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 10.728744939271255, |
|
"eval_loss": 0.0015407928731292486, |
|
"eval_runtime": 20.8476, |
|
"eval_samples_per_second": 4.797, |
|
"eval_steps_per_second": 1.199, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 10.728744939271255, |
|
"step": 1325, |
|
"total_flos": 3.261064452032889e+18, |
|
"train_loss": 0.0019915386885712098, |
|
"train_runtime": 25159.8869, |
|
"train_samples_per_second": 3.927, |
|
"train_steps_per_second": 0.122 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 3075, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 4 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.261064452032889e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|