{
"best_metric": 0.001362333190627396,
"best_model_checkpoint": "/home/paperspace/Data/models/reliance/llm3br256/checkpoint-1200",
"epoch": 10.728744939271255,
"eval_steps": 25,
"global_step": 1325,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008097165991902834,
"grad_norm": 0.08275424689054489,
"learning_rate": 3.246753246753247e-07,
"loss": 0.0308,
"step": 1
},
{
"epoch": 0.016194331983805668,
"grad_norm": 0.07216893136501312,
"learning_rate": 6.493506493506494e-07,
"loss": 0.023,
"step": 2
},
{
"epoch": 0.024291497975708502,
"grad_norm": 0.07287438213825226,
"learning_rate": 9.74025974025974e-07,
"loss": 0.0238,
"step": 3
},
{
"epoch": 0.032388663967611336,
"grad_norm": 0.08366404473781586,
"learning_rate": 1.2987012987012988e-06,
"loss": 0.0231,
"step": 4
},
{
"epoch": 0.04048582995951417,
"grad_norm": 0.06945216655731201,
"learning_rate": 1.6233766233766232e-06,
"loss": 0.0274,
"step": 5
},
{
"epoch": 0.048582995951417005,
"grad_norm": 0.0757410079240799,
"learning_rate": 1.948051948051948e-06,
"loss": 0.0254,
"step": 6
},
{
"epoch": 0.05668016194331984,
"grad_norm": 0.07382987439632416,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.0265,
"step": 7
},
{
"epoch": 0.06477732793522267,
"grad_norm": 0.06420081108808517,
"learning_rate": 2.5974025974025976e-06,
"loss": 0.0253,
"step": 8
},
{
"epoch": 0.0728744939271255,
"grad_norm": 0.06388016045093536,
"learning_rate": 2.922077922077922e-06,
"loss": 0.0246,
"step": 9
},
{
"epoch": 0.08097165991902834,
"grad_norm": 0.06586486101150513,
"learning_rate": 3.2467532467532465e-06,
"loss": 0.0237,
"step": 10
},
{
"epoch": 0.08906882591093117,
"grad_norm": 0.057881902903318405,
"learning_rate": 3.5714285714285714e-06,
"loss": 0.0223,
"step": 11
},
{
"epoch": 0.09716599190283401,
"grad_norm": 0.0731276124715805,
"learning_rate": 3.896103896103896e-06,
"loss": 0.0221,
"step": 12
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.04406806081533432,
"learning_rate": 4.220779220779221e-06,
"loss": 0.0195,
"step": 13
},
{
"epoch": 0.11336032388663968,
"grad_norm": 0.0436287596821785,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.0168,
"step": 14
},
{
"epoch": 0.1214574898785425,
"grad_norm": 0.047254908829927444,
"learning_rate": 4.870129870129871e-06,
"loss": 0.0224,
"step": 15
},
{
"epoch": 0.12955465587044535,
"grad_norm": 0.041482556611299515,
"learning_rate": 5.194805194805195e-06,
"loss": 0.0193,
"step": 16
},
{
"epoch": 0.13765182186234817,
"grad_norm": 0.04704615846276283,
"learning_rate": 5.51948051948052e-06,
"loss": 0.0219,
"step": 17
},
{
"epoch": 0.145748987854251,
"grad_norm": 0.04699448496103287,
"learning_rate": 5.844155844155844e-06,
"loss": 0.0218,
"step": 18
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.049021102488040924,
"learning_rate": 6.168831168831169e-06,
"loss": 0.0243,
"step": 19
},
{
"epoch": 0.16194331983805668,
"grad_norm": 0.03877793252468109,
"learning_rate": 6.493506493506493e-06,
"loss": 0.0167,
"step": 20
},
{
"epoch": 0.1700404858299595,
"grad_norm": 0.041388873010873795,
"learning_rate": 6.818181818181818e-06,
"loss": 0.0172,
"step": 21
},
{
"epoch": 0.17813765182186234,
"grad_norm": 0.04476911574602127,
"learning_rate": 7.142857142857143e-06,
"loss": 0.0215,
"step": 22
},
{
"epoch": 0.1862348178137652,
"grad_norm": 0.03552476316690445,
"learning_rate": 7.467532467532468e-06,
"loss": 0.0179,
"step": 23
},
{
"epoch": 0.19433198380566802,
"grad_norm": 0.03406437113881111,
"learning_rate": 7.792207792207792e-06,
"loss": 0.0207,
"step": 24
},
{
"epoch": 0.20242914979757085,
"grad_norm": 0.030436363071203232,
"learning_rate": 8.116883116883117e-06,
"loss": 0.0197,
"step": 25
},
{
"epoch": 0.20242914979757085,
"eval_loss": 0.017658039927482605,
"eval_runtime": 22.7487,
"eval_samples_per_second": 4.396,
"eval_steps_per_second": 1.099,
"step": 25
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.030306411907076836,
"learning_rate": 8.441558441558442e-06,
"loss": 0.0148,
"step": 26
},
{
"epoch": 0.21862348178137653,
"grad_norm": 0.02774702198803425,
"learning_rate": 8.766233766233767e-06,
"loss": 0.0168,
"step": 27
},
{
"epoch": 0.22672064777327935,
"grad_norm": 0.026585258543491364,
"learning_rate": 9.090909090909091e-06,
"loss": 0.0115,
"step": 28
},
{
"epoch": 0.23481781376518218,
"grad_norm": 0.026557059958577156,
"learning_rate": 9.415584415584416e-06,
"loss": 0.0154,
"step": 29
},
{
"epoch": 0.242914979757085,
"grad_norm": 0.026998251676559448,
"learning_rate": 9.740259740259742e-06,
"loss": 0.014,
"step": 30
},
{
"epoch": 0.25101214574898784,
"grad_norm": 0.027236543595790863,
"learning_rate": 1.0064935064935065e-05,
"loss": 0.0152,
"step": 31
},
{
"epoch": 0.2591093117408907,
"grad_norm": 0.029114605858922005,
"learning_rate": 1.038961038961039e-05,
"loss": 0.0157,
"step": 32
},
{
"epoch": 0.26720647773279355,
"grad_norm": 0.02437474951148033,
"learning_rate": 1.0714285714285714e-05,
"loss": 0.0163,
"step": 33
},
{
"epoch": 0.27530364372469635,
"grad_norm": 0.023844681680202484,
"learning_rate": 1.103896103896104e-05,
"loss": 0.0147,
"step": 34
},
{
"epoch": 0.2834008097165992,
"grad_norm": 0.021733107045292854,
"learning_rate": 1.1363636363636365e-05,
"loss": 0.0164,
"step": 35
},
{
"epoch": 0.291497975708502,
"grad_norm": 0.022121932357549667,
"learning_rate": 1.1688311688311688e-05,
"loss": 0.0142,
"step": 36
},
{
"epoch": 0.29959514170040485,
"grad_norm": 0.020241033285856247,
"learning_rate": 1.2012987012987014e-05,
"loss": 0.015,
"step": 37
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.02085702121257782,
"learning_rate": 1.2337662337662339e-05,
"loss": 0.0147,
"step": 38
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.023749109357595444,
"learning_rate": 1.2662337662337662e-05,
"loss": 0.0156,
"step": 39
},
{
"epoch": 0.32388663967611336,
"grad_norm": 0.02149099111557007,
"learning_rate": 1.2987012987012986e-05,
"loss": 0.0125,
"step": 40
},
{
"epoch": 0.3319838056680162,
"grad_norm": 0.020449506118893623,
"learning_rate": 1.3311688311688311e-05,
"loss": 0.0107,
"step": 41
},
{
"epoch": 0.340080971659919,
"grad_norm": 0.020927896723151207,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.0119,
"step": 42
},
{
"epoch": 0.3481781376518219,
"grad_norm": 0.018237633630633354,
"learning_rate": 1.396103896103896e-05,
"loss": 0.0109,
"step": 43
},
{
"epoch": 0.3562753036437247,
"grad_norm": 0.019094541668891907,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.0113,
"step": 44
},
{
"epoch": 0.3643724696356275,
"grad_norm": 0.020349925383925438,
"learning_rate": 1.461038961038961e-05,
"loss": 0.018,
"step": 45
},
{
"epoch": 0.3724696356275304,
"grad_norm": 0.017563968896865845,
"learning_rate": 1.4935064935064936e-05,
"loss": 0.0105,
"step": 46
},
{
"epoch": 0.3805668016194332,
"grad_norm": 0.020637603476643562,
"learning_rate": 1.525974025974026e-05,
"loss": 0.0111,
"step": 47
},
{
"epoch": 0.38866396761133604,
"grad_norm": 0.01847653090953827,
"learning_rate": 1.5584415584415583e-05,
"loss": 0.0104,
"step": 48
},
{
"epoch": 0.3967611336032389,
"grad_norm": 0.019373638555407524,
"learning_rate": 1.590909090909091e-05,
"loss": 0.0122,
"step": 49
},
{
"epoch": 0.4048582995951417,
"grad_norm": 0.01981317810714245,
"learning_rate": 1.6233766233766234e-05,
"loss": 0.0118,
"step": 50
},
{
"epoch": 0.4048582995951417,
"eval_loss": 0.011365661397576332,
"eval_runtime": 20.8684,
"eval_samples_per_second": 4.792,
"eval_steps_per_second": 1.198,
"step": 50
},
{
"epoch": 0.41295546558704455,
"grad_norm": 0.024859532713890076,
"learning_rate": 1.655844155844156e-05,
"loss": 0.013,
"step": 51
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.02114563249051571,
"learning_rate": 1.6883116883116884e-05,
"loss": 0.0101,
"step": 52
},
{
"epoch": 0.4291497975708502,
"grad_norm": 0.017897600308060646,
"learning_rate": 1.7207792207792208e-05,
"loss": 0.0102,
"step": 53
},
{
"epoch": 0.43724696356275305,
"grad_norm": 0.0172622948884964,
"learning_rate": 1.7532467532467535e-05,
"loss": 0.009,
"step": 54
},
{
"epoch": 0.44534412955465585,
"grad_norm": 0.01602226495742798,
"learning_rate": 1.785714285714286e-05,
"loss": 0.0094,
"step": 55
},
{
"epoch": 0.4534412955465587,
"grad_norm": 0.018682394176721573,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.0101,
"step": 56
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.016763649880886078,
"learning_rate": 1.850649350649351e-05,
"loss": 0.0091,
"step": 57
},
{
"epoch": 0.46963562753036436,
"grad_norm": 0.021187469363212585,
"learning_rate": 1.8831168831168833e-05,
"loss": 0.0085,
"step": 58
},
{
"epoch": 0.4777327935222672,
"grad_norm": 0.01601949706673622,
"learning_rate": 1.9155844155844156e-05,
"loss": 0.0069,
"step": 59
},
{
"epoch": 0.48582995951417,
"grad_norm": 0.012528536841273308,
"learning_rate": 1.9480519480519483e-05,
"loss": 0.0083,
"step": 60
},
{
"epoch": 0.4939271255060729,
"grad_norm": 0.019854655489325523,
"learning_rate": 1.9805194805194807e-05,
"loss": 0.0118,
"step": 61
},
{
"epoch": 0.5020242914979757,
"grad_norm": 0.016604121774435043,
"learning_rate": 2.012987012987013e-05,
"loss": 0.0078,
"step": 62
},
{
"epoch": 0.5101214574898786,
"grad_norm": 0.017011208459734917,
"learning_rate": 2.0454545454545457e-05,
"loss": 0.0096,
"step": 63
},
{
"epoch": 0.5182186234817814,
"grad_norm": 0.017113033682107925,
"learning_rate": 2.077922077922078e-05,
"loss": 0.0121,
"step": 64
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.014709901064634323,
"learning_rate": 2.1103896103896105e-05,
"loss": 0.0066,
"step": 65
},
{
"epoch": 0.5344129554655871,
"grad_norm": 0.01747279427945614,
"learning_rate": 2.1428571428571428e-05,
"loss": 0.0071,
"step": 66
},
{
"epoch": 0.5425101214574899,
"grad_norm": 0.01678309217095375,
"learning_rate": 2.1753246753246752e-05,
"loss": 0.0061,
"step": 67
},
{
"epoch": 0.5506072874493927,
"grad_norm": 0.014886225573718548,
"learning_rate": 2.207792207792208e-05,
"loss": 0.007,
"step": 68
},
{
"epoch": 0.5587044534412956,
"grad_norm": 0.017061002552509308,
"learning_rate": 2.2402597402597402e-05,
"loss": 0.0094,
"step": 69
},
{
"epoch": 0.5668016194331984,
"grad_norm": 0.014715418219566345,
"learning_rate": 2.272727272727273e-05,
"loss": 0.0066,
"step": 70
},
{
"epoch": 0.5748987854251012,
"grad_norm": 0.018518812954425812,
"learning_rate": 2.3051948051948053e-05,
"loss": 0.01,
"step": 71
},
{
"epoch": 0.582995951417004,
"grad_norm": 0.020052259787917137,
"learning_rate": 2.3376623376623376e-05,
"loss": 0.011,
"step": 72
},
{
"epoch": 0.5910931174089069,
"grad_norm": 0.01645250990986824,
"learning_rate": 2.3701298701298703e-05,
"loss": 0.0052,
"step": 73
},
{
"epoch": 0.5991902834008097,
"grad_norm": 0.015539892017841339,
"learning_rate": 2.4025974025974027e-05,
"loss": 0.0096,
"step": 74
},
{
"epoch": 0.6072874493927125,
"grad_norm": 0.017328433692455292,
"learning_rate": 2.435064935064935e-05,
"loss": 0.0103,
"step": 75
},
{
"epoch": 0.6072874493927125,
"eval_loss": 0.007956410758197308,
"eval_runtime": 20.8871,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 75
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.016708405688405037,
"learning_rate": 2.4675324675324678e-05,
"loss": 0.0079,
"step": 76
},
{
"epoch": 0.6234817813765182,
"grad_norm": 0.018906861543655396,
"learning_rate": 2.5e-05,
"loss": 0.0072,
"step": 77
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.017756767570972443,
"learning_rate": 2.5324675324675325e-05,
"loss": 0.0094,
"step": 78
},
{
"epoch": 0.6396761133603239,
"grad_norm": 0.016792573034763336,
"learning_rate": 2.5649350649350652e-05,
"loss": 0.0085,
"step": 79
},
{
"epoch": 0.6477732793522267,
"grad_norm": 0.016278818249702454,
"learning_rate": 2.5974025974025972e-05,
"loss": 0.0057,
"step": 80
},
{
"epoch": 0.6558704453441295,
"grad_norm": 0.015400170348584652,
"learning_rate": 2.62987012987013e-05,
"loss": 0.0075,
"step": 81
},
{
"epoch": 0.6639676113360324,
"grad_norm": 0.012865799479186535,
"learning_rate": 2.6623376623376623e-05,
"loss": 0.0063,
"step": 82
},
{
"epoch": 0.6720647773279352,
"grad_norm": 0.014955022372305393,
"learning_rate": 2.694805194805195e-05,
"loss": 0.0093,
"step": 83
},
{
"epoch": 0.680161943319838,
"grad_norm": 0.015082084573805332,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.0051,
"step": 84
},
{
"epoch": 0.6882591093117408,
"grad_norm": 0.01421983353793621,
"learning_rate": 2.75974025974026e-05,
"loss": 0.0056,
"step": 85
},
{
"epoch": 0.6963562753036437,
"grad_norm": 0.017437629401683807,
"learning_rate": 2.792207792207792e-05,
"loss": 0.0075,
"step": 86
},
{
"epoch": 0.7044534412955465,
"grad_norm": 0.19036760926246643,
"learning_rate": 2.824675324675325e-05,
"loss": 0.0085,
"step": 87
},
{
"epoch": 0.7125506072874493,
"grad_norm": 0.013543471693992615,
"learning_rate": 2.857142857142857e-05,
"loss": 0.0055,
"step": 88
},
{
"epoch": 0.7206477732793523,
"grad_norm": 0.029237190261483192,
"learning_rate": 2.8896103896103898e-05,
"loss": 0.0049,
"step": 89
},
{
"epoch": 0.728744939271255,
"grad_norm": 0.017158357426524162,
"learning_rate": 2.922077922077922e-05,
"loss": 0.0061,
"step": 90
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.01913885958492756,
"learning_rate": 2.954545454545455e-05,
"loss": 0.0074,
"step": 91
},
{
"epoch": 0.7449392712550608,
"grad_norm": 0.037916868925094604,
"learning_rate": 2.9870129870129872e-05,
"loss": 0.0081,
"step": 92
},
{
"epoch": 0.7530364372469636,
"grad_norm": 0.018052248284220695,
"learning_rate": 3.01948051948052e-05,
"loss": 0.0075,
"step": 93
},
{
"epoch": 0.7611336032388664,
"grad_norm": 0.01774253509938717,
"learning_rate": 3.051948051948052e-05,
"loss": 0.0091,
"step": 94
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.019465815275907516,
"learning_rate": 3.084415584415585e-05,
"loss": 0.0083,
"step": 95
},
{
"epoch": 0.7773279352226721,
"grad_norm": 0.01778685301542282,
"learning_rate": 3.1168831168831166e-05,
"loss": 0.0061,
"step": 96
},
{
"epoch": 0.7854251012145749,
"grad_norm": 0.017645837739109993,
"learning_rate": 3.14935064935065e-05,
"loss": 0.0088,
"step": 97
},
{
"epoch": 0.7935222672064778,
"grad_norm": 0.013044299557805061,
"learning_rate": 3.181818181818182e-05,
"loss": 0.0046,
"step": 98
},
{
"epoch": 0.8016194331983806,
"grad_norm": 0.015588215552270412,
"learning_rate": 3.2142857142857144e-05,
"loss": 0.0048,
"step": 99
},
{
"epoch": 0.8097165991902834,
"grad_norm": 0.014032929204404354,
"learning_rate": 3.246753246753247e-05,
"loss": 0.0091,
"step": 100
},
{
"epoch": 0.8097165991902834,
"eval_loss": 0.007168960757553577,
"eval_runtime": 20.8882,
"eval_samples_per_second": 4.787,
"eval_steps_per_second": 1.197,
"step": 100
},
{
"epoch": 0.8178137651821862,
"grad_norm": 0.013353945687413216,
"learning_rate": 3.27922077922078e-05,
"loss": 0.0049,
"step": 101
},
{
"epoch": 0.8259109311740891,
"grad_norm": 0.012625321745872498,
"learning_rate": 3.311688311688312e-05,
"loss": 0.0045,
"step": 102
},
{
"epoch": 0.8340080971659919,
"grad_norm": 0.01578591763973236,
"learning_rate": 3.344155844155844e-05,
"loss": 0.0064,
"step": 103
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.02107596956193447,
"learning_rate": 3.376623376623377e-05,
"loss": 0.0111,
"step": 104
},
{
"epoch": 0.8502024291497976,
"grad_norm": 0.014094969257712364,
"learning_rate": 3.409090909090909e-05,
"loss": 0.0043,
"step": 105
},
{
"epoch": 0.8582995951417004,
"grad_norm": 0.01773056574165821,
"learning_rate": 3.4415584415584416e-05,
"loss": 0.0065,
"step": 106
},
{
"epoch": 0.8663967611336032,
"grad_norm": 0.01486600749194622,
"learning_rate": 3.474025974025974e-05,
"loss": 0.0052,
"step": 107
},
{
"epoch": 0.8744939271255061,
"grad_norm": 0.01461310125887394,
"learning_rate": 3.506493506493507e-05,
"loss": 0.0037,
"step": 108
},
{
"epoch": 0.8825910931174089,
"grad_norm": 0.0219147726893425,
"learning_rate": 3.5389610389610387e-05,
"loss": 0.007,
"step": 109
},
{
"epoch": 0.8906882591093117,
"grad_norm": 0.01585337519645691,
"learning_rate": 3.571428571428572e-05,
"loss": 0.004,
"step": 110
},
{
"epoch": 0.8987854251012146,
"grad_norm": 0.01616801507771015,
"learning_rate": 3.603896103896104e-05,
"loss": 0.006,
"step": 111
},
{
"epoch": 0.9068825910931174,
"grad_norm": 0.015305282548069954,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.0045,
"step": 112
},
{
"epoch": 0.9149797570850202,
"grad_norm": 0.013390602543950081,
"learning_rate": 3.668831168831169e-05,
"loss": 0.0054,
"step": 113
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.016158539801836014,
"learning_rate": 3.701298701298702e-05,
"loss": 0.0053,
"step": 114
},
{
"epoch": 0.9311740890688259,
"grad_norm": 0.015498949214816093,
"learning_rate": 3.7337662337662335e-05,
"loss": 0.0044,
"step": 115
},
{
"epoch": 0.9392712550607287,
"grad_norm": 0.013625388033688068,
"learning_rate": 3.7662337662337665e-05,
"loss": 0.0045,
"step": 116
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.018029414117336273,
"learning_rate": 3.798701298701299e-05,
"loss": 0.0051,
"step": 117
},
{
"epoch": 0.9554655870445344,
"grad_norm": 0.018329549580812454,
"learning_rate": 3.831168831168831e-05,
"loss": 0.0078,
"step": 118
},
{
"epoch": 0.9635627530364372,
"grad_norm": 0.015500400215387344,
"learning_rate": 3.8636363636363636e-05,
"loss": 0.0036,
"step": 119
},
{
"epoch": 0.97165991902834,
"grad_norm": 0.01624232903122902,
"learning_rate": 3.8961038961038966e-05,
"loss": 0.0063,
"step": 120
},
{
"epoch": 0.979757085020243,
"grad_norm": 0.014512493275105953,
"learning_rate": 3.928571428571429e-05,
"loss": 0.0042,
"step": 121
},
{
"epoch": 0.9878542510121457,
"grad_norm": 0.018440047279000282,
"learning_rate": 3.9610389610389614e-05,
"loss": 0.0058,
"step": 122
},
{
"epoch": 0.9959514170040485,
"grad_norm": 0.011620835401117802,
"learning_rate": 3.993506493506494e-05,
"loss": 0.0029,
"step": 123
},
{
"epoch": 1.0040485829959513,
"grad_norm": 0.028106795623898506,
"learning_rate": 4.025974025974026e-05,
"loss": 0.012,
"step": 124
},
{
"epoch": 1.0121457489878543,
"grad_norm": 0.009489455260336399,
"learning_rate": 4.0584415584415584e-05,
"loss": 0.003,
"step": 125
},
{
"epoch": 1.0121457489878543,
"eval_loss": 0.0060044582933187485,
"eval_runtime": 20.8715,
"eval_samples_per_second": 4.791,
"eval_steps_per_second": 1.198,
"step": 125
},
{
"epoch": 1.0202429149797572,
"grad_norm": 0.01749836467206478,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.0087,
"step": 126
},
{
"epoch": 1.0283400809716599,
"grad_norm": 0.011480778455734253,
"learning_rate": 4.123376623376624e-05,
"loss": 0.0035,
"step": 127
},
{
"epoch": 1.0364372469635628,
"grad_norm": 0.012941240333020687,
"learning_rate": 4.155844155844156e-05,
"loss": 0.0053,
"step": 128
},
{
"epoch": 1.0445344129554657,
"grad_norm": 0.012464286759495735,
"learning_rate": 4.1883116883116886e-05,
"loss": 0.0041,
"step": 129
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.013026767410337925,
"learning_rate": 4.220779220779221e-05,
"loss": 0.0058,
"step": 130
},
{
"epoch": 1.0607287449392713,
"grad_norm": 0.014864835888147354,
"learning_rate": 4.253246753246753e-05,
"loss": 0.0037,
"step": 131
},
{
"epoch": 1.0688259109311742,
"grad_norm": 0.011576451361179352,
"learning_rate": 4.2857142857142856e-05,
"loss": 0.0038,
"step": 132
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.016221897676587105,
"learning_rate": 4.318181818181819e-05,
"loss": 0.005,
"step": 133
},
{
"epoch": 1.0850202429149798,
"grad_norm": 0.013863411732017994,
"learning_rate": 4.3506493506493503e-05,
"loss": 0.0052,
"step": 134
},
{
"epoch": 1.0931174089068827,
"grad_norm": 0.014415189623832703,
"learning_rate": 4.3831168831168834e-05,
"loss": 0.0041,
"step": 135
},
{
"epoch": 1.1012145748987854,
"grad_norm": 0.014737873338162899,
"learning_rate": 4.415584415584416e-05,
"loss": 0.0042,
"step": 136
},
{
"epoch": 1.1093117408906883,
"grad_norm": 0.015526373870670795,
"learning_rate": 4.448051948051948e-05,
"loss": 0.0029,
"step": 137
},
{
"epoch": 1.117408906882591,
"grad_norm": 0.014790773391723633,
"learning_rate": 4.4805194805194805e-05,
"loss": 0.0052,
"step": 138
},
{
"epoch": 1.125506072874494,
"grad_norm": 0.02353314682841301,
"learning_rate": 4.5129870129870135e-05,
"loss": 0.0093,
"step": 139
},
{
"epoch": 1.1336032388663968,
"grad_norm": 0.016826335340738297,
"learning_rate": 4.545454545454546e-05,
"loss": 0.0052,
"step": 140
},
{
"epoch": 1.1417004048582995,
"grad_norm": 0.014538138173520565,
"learning_rate": 4.577922077922078e-05,
"loss": 0.0055,
"step": 141
},
{
"epoch": 1.1497975708502024,
"grad_norm": 0.016404012218117714,
"learning_rate": 4.6103896103896106e-05,
"loss": 0.0044,
"step": 142
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.014474052004516125,
"learning_rate": 4.642857142857143e-05,
"loss": 0.0037,
"step": 143
},
{
"epoch": 1.165991902834008,
"grad_norm": 0.01470700092613697,
"learning_rate": 4.675324675324675e-05,
"loss": 0.0069,
"step": 144
},
{
"epoch": 1.174089068825911,
"grad_norm": 0.01384800300002098,
"learning_rate": 4.707792207792208e-05,
"loss": 0.0066,
"step": 145
},
{
"epoch": 1.1821862348178138,
"grad_norm": 0.012554049491882324,
"learning_rate": 4.740259740259741e-05,
"loss": 0.0044,
"step": 146
},
{
"epoch": 1.1902834008097165,
"grad_norm": 0.015297799371182919,
"learning_rate": 4.772727272727273e-05,
"loss": 0.0048,
"step": 147
},
{
"epoch": 1.1983805668016194,
"grad_norm": 0.013141577132046223,
"learning_rate": 4.8051948051948054e-05,
"loss": 0.0024,
"step": 148
},
{
"epoch": 1.2064777327935223,
"grad_norm": 0.013096342794597149,
"learning_rate": 4.8376623376623384e-05,
"loss": 0.0025,
"step": 149
},
{
"epoch": 1.214574898785425,
"grad_norm": 0.01221439242362976,
"learning_rate": 4.87012987012987e-05,
"loss": 0.0036,
"step": 150
},
{
"epoch": 1.214574898785425,
"eval_loss": 0.005298578180372715,
"eval_runtime": 20.8777,
"eval_samples_per_second": 4.79,
"eval_steps_per_second": 1.197,
"step": 150
},
{
"epoch": 1.222672064777328,
"grad_norm": 0.017413007095456123,
"learning_rate": 4.902597402597403e-05,
"loss": 0.0073,
"step": 151
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.020000923424959183,
"learning_rate": 4.9350649350649355e-05,
"loss": 0.0029,
"step": 152
},
{
"epoch": 1.2388663967611335,
"grad_norm": 0.013662380166351795,
"learning_rate": 4.967532467532468e-05,
"loss": 0.004,
"step": 153
},
{
"epoch": 1.2469635627530364,
"grad_norm": 0.013253867626190186,
"learning_rate": 5e-05,
"loss": 0.0029,
"step": 154
},
{
"epoch": 1.2550607287449393,
"grad_norm": 0.016671188175678253,
"learning_rate": 5.032467532467533e-05,
"loss": 0.0034,
"step": 155
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.012826805002987385,
"learning_rate": 5.064935064935065e-05,
"loss": 0.0026,
"step": 156
},
{
"epoch": 1.271255060728745,
"grad_norm": 0.016341542825102806,
"learning_rate": 5.097402597402597e-05,
"loss": 0.0046,
"step": 157
},
{
"epoch": 1.2793522267206479,
"grad_norm": 0.013105432502925396,
"learning_rate": 5.1298701298701304e-05,
"loss": 0.0028,
"step": 158
},
{
"epoch": 1.2874493927125505,
"grad_norm": 0.015593166463077068,
"learning_rate": 5.162337662337663e-05,
"loss": 0.0044,
"step": 159
},
{
"epoch": 1.2955465587044535,
"grad_norm": 0.017734261229634285,
"learning_rate": 5.1948051948051944e-05,
"loss": 0.0053,
"step": 160
},
{
"epoch": 1.3036437246963564,
"grad_norm": 0.013654530048370361,
"learning_rate": 5.2272727272727274e-05,
"loss": 0.0036,
"step": 161
},
{
"epoch": 1.311740890688259,
"grad_norm": 0.01586996205151081,
"learning_rate": 5.25974025974026e-05,
"loss": 0.0028,
"step": 162
},
{
"epoch": 1.319838056680162,
"grad_norm": 0.014020202681422234,
"learning_rate": 5.292207792207793e-05,
"loss": 0.0033,
"step": 163
},
{
"epoch": 1.3279352226720649,
"grad_norm": 0.014661739580333233,
"learning_rate": 5.3246753246753245e-05,
"loss": 0.0022,
"step": 164
},
{
"epoch": 1.3360323886639676,
"grad_norm": 0.015314622782170773,
"learning_rate": 5.3571428571428575e-05,
"loss": 0.0047,
"step": 165
},
{
"epoch": 1.3441295546558705,
"grad_norm": 0.016851790249347687,
"learning_rate": 5.38961038961039e-05,
"loss": 0.0029,
"step": 166
},
{
"epoch": 1.3522267206477734,
"grad_norm": 0.01127530261874199,
"learning_rate": 5.422077922077923e-05,
"loss": 0.0031,
"step": 167
},
{
"epoch": 1.360323886639676,
"grad_norm": 0.012864851392805576,
"learning_rate": 5.4545454545454546e-05,
"loss": 0.0026,
"step": 168
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.012660622596740723,
"learning_rate": 5.487012987012987e-05,
"loss": 0.0033,
"step": 169
},
{
"epoch": 1.376518218623482,
"grad_norm": 0.020632926374673843,
"learning_rate": 5.51948051948052e-05,
"loss": 0.005,
"step": 170
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.014771834947168827,
"learning_rate": 5.5519480519480524e-05,
"loss": 0.005,
"step": 171
},
{
"epoch": 1.3927125506072875,
"grad_norm": 0.014798545278608799,
"learning_rate": 5.584415584415584e-05,
"loss": 0.0018,
"step": 172
},
{
"epoch": 1.4008097165991904,
"grad_norm": 0.01847289875149727,
"learning_rate": 5.616883116883117e-05,
"loss": 0.0048,
"step": 173
},
{
"epoch": 1.408906882591093,
"grad_norm": 0.013270820491015911,
"learning_rate": 5.64935064935065e-05,
"loss": 0.0045,
"step": 174
},
{
"epoch": 1.417004048582996,
"grad_norm": 0.0156845785677433,
"learning_rate": 5.6818181818181825e-05,
"loss": 0.0052,
"step": 175
},
{
"epoch": 1.417004048582996,
"eval_loss": 0.004898196551948786,
"eval_runtime": 20.8955,
"eval_samples_per_second": 4.786,
"eval_steps_per_second": 1.196,
"step": 175
},
{
"epoch": 1.425101214574899,
"grad_norm": 0.0185660719871521,
"learning_rate": 5.714285714285714e-05,
"loss": 0.003,
"step": 176
},
{
"epoch": 1.4331983805668016,
"grad_norm": 0.016853397712111473,
"learning_rate": 5.746753246753247e-05,
"loss": 0.0033,
"step": 177
},
{
"epoch": 1.4412955465587045,
"grad_norm": 0.012373693287372589,
"learning_rate": 5.7792207792207796e-05,
"loss": 0.0027,
"step": 178
},
{
"epoch": 1.4493927125506074,
"grad_norm": 0.02164478786289692,
"learning_rate": 5.8116883116883126e-05,
"loss": 0.0039,
"step": 179
},
{
"epoch": 1.45748987854251,
"grad_norm": 0.019912002608180046,
"learning_rate": 5.844155844155844e-05,
"loss": 0.0041,
"step": 180
},
{
"epoch": 1.465587044534413,
"grad_norm": 0.011308755725622177,
"learning_rate": 5.8766233766233766e-05,
"loss": 0.0024,
"step": 181
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.014568260870873928,
"learning_rate": 5.90909090909091e-05,
"loss": 0.0016,
"step": 182
},
{
"epoch": 1.4817813765182186,
"grad_norm": 0.011573289521038532,
"learning_rate": 5.9415584415584414e-05,
"loss": 0.0022,
"step": 183
},
{
"epoch": 1.4898785425101215,
"grad_norm": 0.014651118777692318,
"learning_rate": 5.9740259740259744e-05,
"loss": 0.0051,
"step": 184
},
{
"epoch": 1.4979757085020242,
"grad_norm": 0.014680047519505024,
"learning_rate": 6.006493506493507e-05,
"loss": 0.0024,
"step": 185
},
{
"epoch": 1.5060728744939271,
"grad_norm": 0.015858447179198265,
"learning_rate": 6.03896103896104e-05,
"loss": 0.0038,
"step": 186
},
{
"epoch": 1.5141700404858298,
"grad_norm": 0.015239309519529343,
"learning_rate": 6.0714285714285715e-05,
"loss": 0.0036,
"step": 187
},
{
"epoch": 1.522267206477733,
"grad_norm": 0.01742137223482132,
"learning_rate": 6.103896103896104e-05,
"loss": 0.0034,
"step": 188
},
{
"epoch": 1.5303643724696356,
"grad_norm": 0.01396004669368267,
"learning_rate": 6.136363636363636e-05,
"loss": 0.0028,
"step": 189
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.01871366612613201,
"learning_rate": 6.16883116883117e-05,
"loss": 0.0054,
"step": 190
},
{
"epoch": 1.5465587044534415,
"grad_norm": 0.01240773219615221,
"learning_rate": 6.201298701298701e-05,
"loss": 0.0021,
"step": 191
},
{
"epoch": 1.5546558704453441,
"grad_norm": 0.019545145332813263,
"learning_rate": 6.233766233766233e-05,
"loss": 0.0044,
"step": 192
},
{
"epoch": 1.5627530364372468,
"grad_norm": 0.011620803736150265,
"learning_rate": 6.266233766233767e-05,
"loss": 0.0034,
"step": 193
},
{
"epoch": 1.5708502024291497,
"grad_norm": 0.018584923818707466,
"learning_rate": 6.2987012987013e-05,
"loss": 0.0065,
"step": 194
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.01458702515810728,
"learning_rate": 6.331168831168832e-05,
"loss": 0.0033,
"step": 195
},
{
"epoch": 1.5870445344129553,
"grad_norm": 0.01610160432755947,
"learning_rate": 6.363636363636364e-05,
"loss": 0.0045,
"step": 196
},
{
"epoch": 1.5951417004048583,
"grad_norm": 0.017108574509620667,
"learning_rate": 6.396103896103896e-05,
"loss": 0.0032,
"step": 197
},
{
"epoch": 1.6032388663967612,
"grad_norm": 0.013298330828547478,
"learning_rate": 6.428571428571429e-05,
"loss": 0.0026,
"step": 198
},
{
"epoch": 1.6113360323886639,
"grad_norm": 0.013509229756891727,
"learning_rate": 6.461038961038961e-05,
"loss": 0.0018,
"step": 199
},
{
"epoch": 1.6194331983805668,
"grad_norm": 0.01902744546532631,
"learning_rate": 6.493506493506494e-05,
"loss": 0.0029,
"step": 200
},
{
"epoch": 1.6194331983805668,
"eval_loss": 0.004173097666352987,
"eval_runtime": 20.8941,
"eval_samples_per_second": 4.786,
"eval_steps_per_second": 1.197,
"step": 200
},
{
"epoch": 1.6275303643724697,
"grad_norm": 0.015973802655935287,
"learning_rate": 6.525974025974026e-05,
"loss": 0.0028,
"step": 201
},
{
"epoch": 1.6356275303643724,
"grad_norm": 0.018992941826581955,
"learning_rate": 6.55844155844156e-05,
"loss": 0.0052,
"step": 202
},
{
"epoch": 1.6437246963562753,
"grad_norm": 0.014920739457011223,
"learning_rate": 6.59090909090909e-05,
"loss": 0.0039,
"step": 203
},
{
"epoch": 1.6518218623481782,
"grad_norm": 0.015221747569739819,
"learning_rate": 6.623376623376624e-05,
"loss": 0.0026,
"step": 204
},
{
"epoch": 1.6599190283400809,
"grad_norm": 0.01537750568240881,
"learning_rate": 6.655844155844157e-05,
"loss": 0.0062,
"step": 205
},
{
"epoch": 1.6680161943319838,
"grad_norm": 0.011275989934802055,
"learning_rate": 6.688311688311688e-05,
"loss": 0.0027,
"step": 206
},
{
"epoch": 1.6761133603238867,
"grad_norm": 0.017085865139961243,
"learning_rate": 6.720779220779221e-05,
"loss": 0.0029,
"step": 207
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.012188843451440334,
"learning_rate": 6.753246753246754e-05,
"loss": 0.0033,
"step": 208
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.009229995310306549,
"learning_rate": 6.785714285714286e-05,
"loss": 0.0023,
"step": 209
},
{
"epoch": 1.7004048582995952,
"grad_norm": 0.013247930444777012,
"learning_rate": 6.818181818181818e-05,
"loss": 0.0025,
"step": 210
},
{
"epoch": 1.708502024291498,
"grad_norm": 0.017777971923351288,
"learning_rate": 6.850649350649351e-05,
"loss": 0.0059,
"step": 211
},
{
"epoch": 1.7165991902834008,
"grad_norm": 0.011387546546757221,
"learning_rate": 6.883116883116883e-05,
"loss": 0.0024,
"step": 212
},
{
"epoch": 1.7246963562753037,
"grad_norm": 0.013648373074829578,
"learning_rate": 6.915584415584417e-05,
"loss": 0.0027,
"step": 213
},
{
"epoch": 1.7327935222672064,
"grad_norm": 0.012230796739459038,
"learning_rate": 6.948051948051948e-05,
"loss": 0.0032,
"step": 214
},
{
"epoch": 1.7408906882591093,
"grad_norm": 0.00890358630567789,
"learning_rate": 6.98051948051948e-05,
"loss": 0.0017,
"step": 215
},
{
"epoch": 1.7489878542510122,
"grad_norm": 0.019259551540017128,
"learning_rate": 7.012987012987014e-05,
"loss": 0.0037,
"step": 216
},
{
"epoch": 1.757085020242915,
"grad_norm": 0.01166984811425209,
"learning_rate": 7.045454545454546e-05,
"loss": 0.0029,
"step": 217
},
{
"epoch": 1.7651821862348178,
"grad_norm": 0.014050965197384357,
"learning_rate": 7.077922077922077e-05,
"loss": 0.0019,
"step": 218
},
{
"epoch": 1.7732793522267207,
"grad_norm": 0.012960278429090977,
"learning_rate": 7.110389610389611e-05,
"loss": 0.0019,
"step": 219
},
{
"epoch": 1.7813765182186234,
"grad_norm": 0.01847727596759796,
"learning_rate": 7.142857142857143e-05,
"loss": 0.0042,
"step": 220
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.012762092985212803,
"learning_rate": 7.175324675324676e-05,
"loss": 0.0013,
"step": 221
},
{
"epoch": 1.7975708502024292,
"grad_norm": 0.014623441733419895,
"learning_rate": 7.207792207792208e-05,
"loss": 0.0039,
"step": 222
},
{
"epoch": 1.805668016194332,
"grad_norm": 0.017683332785964012,
"learning_rate": 7.24025974025974e-05,
"loss": 0.0043,
"step": 223
},
{
"epoch": 1.8137651821862348,
"grad_norm": 0.017056427896022797,
"learning_rate": 7.272727272727273e-05,
"loss": 0.0036,
"step": 224
},
{
"epoch": 1.8218623481781377,
"grad_norm": 0.010228103026747704,
"learning_rate": 7.305194805194807e-05,
"loss": 0.0017,
"step": 225
},
{
"epoch": 1.8218623481781377,
"eval_loss": 0.0039197178557515144,
"eval_runtime": 20.8731,
"eval_samples_per_second": 4.791,
"eval_steps_per_second": 1.198,
"step": 225
},
{
"epoch": 1.8299595141700404,
"grad_norm": 0.016191432252526283,
"learning_rate": 7.337662337662338e-05,
"loss": 0.0057,
"step": 226
},
{
"epoch": 1.8380566801619433,
"grad_norm": 0.010266617871820927,
"learning_rate": 7.37012987012987e-05,
"loss": 0.0021,
"step": 227
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.016297219321131706,
"learning_rate": 7.402597402597404e-05,
"loss": 0.0038,
"step": 228
},
{
"epoch": 1.854251012145749,
"grad_norm": 0.011634326539933681,
"learning_rate": 7.435064935064936e-05,
"loss": 0.0033,
"step": 229
},
{
"epoch": 1.8623481781376519,
"grad_norm": 0.01735992170870304,
"learning_rate": 7.467532467532467e-05,
"loss": 0.0045,
"step": 230
},
{
"epoch": 1.8704453441295548,
"grad_norm": 0.012468023225665092,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0039,
"step": 231
},
{
"epoch": 1.8785425101214575,
"grad_norm": 0.01030401885509491,
"learning_rate": 7.532467532467533e-05,
"loss": 0.0033,
"step": 232
},
{
"epoch": 1.8866396761133604,
"grad_norm": 0.008860866539180279,
"learning_rate": 7.564935064935065e-05,
"loss": 0.0025,
"step": 233
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.014425918459892273,
"learning_rate": 7.597402597402598e-05,
"loss": 0.0054,
"step": 234
},
{
"epoch": 1.902834008097166,
"grad_norm": 0.012539315037429333,
"learning_rate": 7.62987012987013e-05,
"loss": 0.0044,
"step": 235
},
{
"epoch": 1.9109311740890689,
"grad_norm": 0.0120421526953578,
"learning_rate": 7.662337662337662e-05,
"loss": 0.0048,
"step": 236
},
{
"epoch": 1.9190283400809718,
"grad_norm": 0.011059713549911976,
"learning_rate": 7.694805194805195e-05,
"loss": 0.0024,
"step": 237
},
{
"epoch": 1.9271255060728745,
"grad_norm": 0.01062751654535532,
"learning_rate": 7.727272727272727e-05,
"loss": 0.0025,
"step": 238
},
{
"epoch": 1.9352226720647774,
"grad_norm": 0.009996469132602215,
"learning_rate": 7.75974025974026e-05,
"loss": 0.0022,
"step": 239
},
{
"epoch": 1.9433198380566803,
"grad_norm": 0.014030283316969872,
"learning_rate": 7.792207792207793e-05,
"loss": 0.0027,
"step": 240
},
{
"epoch": 1.951417004048583,
"grad_norm": 0.011797044426202774,
"learning_rate": 7.824675324675324e-05,
"loss": 0.0039,
"step": 241
},
{
"epoch": 1.9595141700404857,
"grad_norm": 0.014973408542573452,
"learning_rate": 7.857142857142858e-05,
"loss": 0.0039,
"step": 242
},
{
"epoch": 1.9676113360323888,
"grad_norm": 0.01119126658886671,
"learning_rate": 7.88961038961039e-05,
"loss": 0.0021,
"step": 243
},
{
"epoch": 1.9757085020242915,
"grad_norm": 0.012466533109545708,
"learning_rate": 7.922077922077923e-05,
"loss": 0.0024,
"step": 244
},
{
"epoch": 1.9838056680161942,
"grad_norm": 0.01311230007559061,
"learning_rate": 7.954545454545455e-05,
"loss": 0.0037,
"step": 245
},
{
"epoch": 1.9919028340080973,
"grad_norm": 0.01020133588463068,
"learning_rate": 7.987012987012987e-05,
"loss": 0.0027,
"step": 246
},
{
"epoch": 2.0,
"grad_norm": 0.01899588108062744,
"learning_rate": 8.01948051948052e-05,
"loss": 0.0032,
"step": 247
},
{
"epoch": 2.0080971659919027,
"grad_norm": 0.011334598064422607,
"learning_rate": 8.051948051948052e-05,
"loss": 0.002,
"step": 248
},
{
"epoch": 2.016194331983806,
"grad_norm": 0.011807809583842754,
"learning_rate": 8.084415584415585e-05,
"loss": 0.002,
"step": 249
},
{
"epoch": 2.0242914979757085,
"grad_norm": 0.010670343413949013,
"learning_rate": 8.116883116883117e-05,
"loss": 0.0022,
"step": 250
},
{
"epoch": 2.0242914979757085,
"eval_loss": 0.0035137999802827835,
"eval_runtime": 20.89,
"eval_samples_per_second": 4.787,
"eval_steps_per_second": 1.197,
"step": 250
},
{
"epoch": 2.032388663967611,
"grad_norm": 0.011268955655395985,
"learning_rate": 8.14935064935065e-05,
"loss": 0.0016,
"step": 251
},
{
"epoch": 2.0404858299595143,
"grad_norm": 0.013640797697007656,
"learning_rate": 8.181818181818183e-05,
"loss": 0.0029,
"step": 252
},
{
"epoch": 2.048582995951417,
"grad_norm": 0.008933900855481625,
"learning_rate": 8.214285714285714e-05,
"loss": 0.0021,
"step": 253
},
{
"epoch": 2.0566801619433197,
"grad_norm": 0.012379194609820843,
"learning_rate": 8.246753246753248e-05,
"loss": 0.0026,
"step": 254
},
{
"epoch": 2.064777327935223,
"grad_norm": 0.015894446521997452,
"learning_rate": 8.27922077922078e-05,
"loss": 0.002,
"step": 255
},
{
"epoch": 2.0728744939271255,
"grad_norm": 0.013013158924877644,
"learning_rate": 8.311688311688312e-05,
"loss": 0.002,
"step": 256
},
{
"epoch": 2.080971659919028,
"grad_norm": 0.00733231520280242,
"learning_rate": 8.344155844155845e-05,
"loss": 0.0012,
"step": 257
},
{
"epoch": 2.0890688259109313,
"grad_norm": 0.011731351725757122,
"learning_rate": 8.376623376623377e-05,
"loss": 0.0028,
"step": 258
},
{
"epoch": 2.097165991902834,
"grad_norm": 0.010311335325241089,
"learning_rate": 8.40909090909091e-05,
"loss": 0.0017,
"step": 259
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.010359687730669975,
"learning_rate": 8.441558441558442e-05,
"loss": 0.0033,
"step": 260
},
{
"epoch": 2.11336032388664,
"grad_norm": 0.017441799864172935,
"learning_rate": 8.474025974025974e-05,
"loss": 0.0032,
"step": 261
},
{
"epoch": 2.1214574898785425,
"grad_norm": 0.014498945325613022,
"learning_rate": 8.506493506493507e-05,
"loss": 0.0048,
"step": 262
},
{
"epoch": 2.1295546558704452,
"grad_norm": 0.011659245006740093,
"learning_rate": 8.53896103896104e-05,
"loss": 0.0021,
"step": 263
},
{
"epoch": 2.1376518218623484,
"grad_norm": 0.01307612657546997,
"learning_rate": 8.571428571428571e-05,
"loss": 0.0024,
"step": 264
},
{
"epoch": 2.145748987854251,
"grad_norm": 0.007933567278087139,
"learning_rate": 8.603896103896104e-05,
"loss": 0.0017,
"step": 265
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.010644306428730488,
"learning_rate": 8.636363636363637e-05,
"loss": 0.0022,
"step": 266
},
{
"epoch": 2.161943319838057,
"grad_norm": 0.017315855249762535,
"learning_rate": 8.66883116883117e-05,
"loss": 0.0013,
"step": 267
},
{
"epoch": 2.1700404858299596,
"grad_norm": 0.013733215630054474,
"learning_rate": 8.701298701298701e-05,
"loss": 0.0019,
"step": 268
},
{
"epoch": 2.1781376518218623,
"grad_norm": 0.019037563353776932,
"learning_rate": 8.733766233766234e-05,
"loss": 0.0044,
"step": 269
},
{
"epoch": 2.1862348178137654,
"grad_norm": 0.014429651200771332,
"learning_rate": 8.766233766233767e-05,
"loss": 0.003,
"step": 270
},
{
"epoch": 2.194331983805668,
"grad_norm": 0.013059676624834538,
"learning_rate": 8.798701298701299e-05,
"loss": 0.0022,
"step": 271
},
{
"epoch": 2.2024291497975708,
"grad_norm": 0.011815879493951797,
"learning_rate": 8.831168831168831e-05,
"loss": 0.0016,
"step": 272
},
{
"epoch": 2.2105263157894735,
"grad_norm": 0.010117270052433014,
"learning_rate": 8.863636363636364e-05,
"loss": 0.0018,
"step": 273
},
{
"epoch": 2.2186234817813766,
"grad_norm": 0.012147231958806515,
"learning_rate": 8.896103896103896e-05,
"loss": 0.0028,
"step": 274
},
{
"epoch": 2.2267206477732793,
"grad_norm": 0.013275344856083393,
"learning_rate": 8.92857142857143e-05,
"loss": 0.0027,
"step": 275
},
{
"epoch": 2.2267206477732793,
"eval_loss": 0.003232660237699747,
"eval_runtime": 20.8846,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 275
},
{
"epoch": 2.234817813765182,
"grad_norm": 0.009600469842553139,
"learning_rate": 8.961038961038961e-05,
"loss": 0.0018,
"step": 276
},
{
"epoch": 2.242914979757085,
"grad_norm": 0.013018307276070118,
"learning_rate": 8.993506493506493e-05,
"loss": 0.0013,
"step": 277
},
{
"epoch": 2.251012145748988,
"grad_norm": 0.013700856827199459,
"learning_rate": 9.025974025974027e-05,
"loss": 0.0026,
"step": 278
},
{
"epoch": 2.2591093117408905,
"grad_norm": 0.012119555845856667,
"learning_rate": 9.05844155844156e-05,
"loss": 0.0014,
"step": 279
},
{
"epoch": 2.2672064777327936,
"grad_norm": 0.01446222048252821,
"learning_rate": 9.090909090909092e-05,
"loss": 0.0016,
"step": 280
},
{
"epoch": 2.2753036437246963,
"grad_norm": 0.008024114184081554,
"learning_rate": 9.123376623376624e-05,
"loss": 0.0014,
"step": 281
},
{
"epoch": 2.283400809716599,
"grad_norm": 0.015081741847097874,
"learning_rate": 9.155844155844156e-05,
"loss": 0.0027,
"step": 282
},
{
"epoch": 2.291497975708502,
"grad_norm": 0.012656310573220253,
"learning_rate": 9.188311688311689e-05,
"loss": 0.0019,
"step": 283
},
{
"epoch": 2.299595141700405,
"grad_norm": 0.016468411311507225,
"learning_rate": 9.220779220779221e-05,
"loss": 0.0025,
"step": 284
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.0127490209415555,
"learning_rate": 9.253246753246754e-05,
"loss": 0.0019,
"step": 285
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.01162753626704216,
"learning_rate": 9.285714285714286e-05,
"loss": 0.0017,
"step": 286
},
{
"epoch": 2.3238866396761133,
"grad_norm": 0.01099877618253231,
"learning_rate": 9.318181818181818e-05,
"loss": 0.0016,
"step": 287
},
{
"epoch": 2.331983805668016,
"grad_norm": 0.009699794463813305,
"learning_rate": 9.35064935064935e-05,
"loss": 0.0018,
"step": 288
},
{
"epoch": 2.340080971659919,
"grad_norm": 0.011390355415642262,
"learning_rate": 9.383116883116884e-05,
"loss": 0.0024,
"step": 289
},
{
"epoch": 2.348178137651822,
"grad_norm": 0.01110926829278469,
"learning_rate": 9.415584415584417e-05,
"loss": 0.0016,
"step": 290
},
{
"epoch": 2.3562753036437245,
"grad_norm": 0.012503352016210556,
"learning_rate": 9.448051948051948e-05,
"loss": 0.0018,
"step": 291
},
{
"epoch": 2.3643724696356276,
"grad_norm": 0.01324179582297802,
"learning_rate": 9.480519480519481e-05,
"loss": 0.0025,
"step": 292
},
{
"epoch": 2.3724696356275303,
"grad_norm": 0.010324635542929173,
"learning_rate": 9.512987012987014e-05,
"loss": 0.0018,
"step": 293
},
{
"epoch": 2.380566801619433,
"grad_norm": 0.010333823971450329,
"learning_rate": 9.545454545454546e-05,
"loss": 0.0012,
"step": 294
},
{
"epoch": 2.388663967611336,
"grad_norm": 0.011566666886210442,
"learning_rate": 9.577922077922078e-05,
"loss": 0.0023,
"step": 295
},
{
"epoch": 2.396761133603239,
"grad_norm": 0.008786414749920368,
"learning_rate": 9.610389610389611e-05,
"loss": 0.0016,
"step": 296
},
{
"epoch": 2.4048582995951415,
"grad_norm": 0.011586328037083149,
"learning_rate": 9.642857142857143e-05,
"loss": 0.0023,
"step": 297
},
{
"epoch": 2.4129554655870447,
"grad_norm": 0.014018291607499123,
"learning_rate": 9.675324675324677e-05,
"loss": 0.0019,
"step": 298
},
{
"epoch": 2.4210526315789473,
"grad_norm": 0.008588538505136967,
"learning_rate": 9.707792207792208e-05,
"loss": 0.0009,
"step": 299
},
{
"epoch": 2.42914979757085,
"grad_norm": 0.009654571302235126,
"learning_rate": 9.74025974025974e-05,
"loss": 0.0017,
"step": 300
},
{
"epoch": 2.42914979757085,
"eval_loss": 0.003001517616212368,
"eval_runtime": 20.9488,
"eval_samples_per_second": 4.774,
"eval_steps_per_second": 1.193,
"step": 300
},
{
"epoch": 2.437246963562753,
"grad_norm": 0.01593548245728016,
"learning_rate": 9.772727272727274e-05,
"loss": 0.0027,
"step": 301
},
{
"epoch": 2.445344129554656,
"grad_norm": 0.01584690809249878,
"learning_rate": 9.805194805194806e-05,
"loss": 0.0031,
"step": 302
},
{
"epoch": 2.4534412955465585,
"grad_norm": 0.01336862612515688,
"learning_rate": 9.837662337662337e-05,
"loss": 0.0032,
"step": 303
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.009371182881295681,
"learning_rate": 9.870129870129871e-05,
"loss": 0.0015,
"step": 304
},
{
"epoch": 2.4696356275303644,
"grad_norm": 0.012227087281644344,
"learning_rate": 9.902597402597403e-05,
"loss": 0.0015,
"step": 305
},
{
"epoch": 2.477732793522267,
"grad_norm": 0.009863483719527721,
"learning_rate": 9.935064935064936e-05,
"loss": 0.0021,
"step": 306
},
{
"epoch": 2.48582995951417,
"grad_norm": 0.013306519947946072,
"learning_rate": 9.967532467532468e-05,
"loss": 0.0032,
"step": 307
},
{
"epoch": 2.493927125506073,
"grad_norm": 0.009393845684826374,
"learning_rate": 0.0001,
"loss": 0.0016,
"step": 308
},
{
"epoch": 2.5020242914979756,
"grad_norm": 0.009558003395795822,
"learning_rate": 9.999996777288795e-05,
"loss": 0.0021,
"step": 309
},
{
"epoch": 2.5101214574898787,
"grad_norm": 0.01038302294909954,
"learning_rate": 9.999987109159334e-05,
"loss": 0.003,
"step": 310
},
{
"epoch": 2.5182186234817814,
"grad_norm": 0.011483744718134403,
"learning_rate": 9.999970995624077e-05,
"loss": 0.0026,
"step": 311
},
{
"epoch": 2.526315789473684,
"grad_norm": 0.012138905003666878,
"learning_rate": 9.9999484367038e-05,
"loss": 0.0018,
"step": 312
},
{
"epoch": 2.534412955465587,
"grad_norm": 0.008925210684537888,
"learning_rate": 9.999919432427583e-05,
"loss": 0.0012,
"step": 313
},
{
"epoch": 2.54251012145749,
"grad_norm": 0.0103675602003932,
"learning_rate": 9.999883982832811e-05,
"loss": 0.0015,
"step": 314
},
{
"epoch": 2.5506072874493926,
"grad_norm": 0.010114219971001148,
"learning_rate": 9.999842087965185e-05,
"loss": 0.0027,
"step": 315
},
{
"epoch": 2.5587044534412957,
"grad_norm": 0.011849566362798214,
"learning_rate": 9.999793747878712e-05,
"loss": 0.0035,
"step": 316
},
{
"epoch": 2.5668016194331984,
"grad_norm": 0.022183779627084732,
"learning_rate": 9.999738962635703e-05,
"loss": 0.0022,
"step": 317
},
{
"epoch": 2.574898785425101,
"grad_norm": 0.01708986796438694,
"learning_rate": 9.999677732306782e-05,
"loss": 0.0021,
"step": 318
},
{
"epoch": 2.582995951417004,
"grad_norm": 0.012563329190015793,
"learning_rate": 9.999610056970881e-05,
"loss": 0.0015,
"step": 319
},
{
"epoch": 2.591093117408907,
"grad_norm": 0.018579822033643723,
"learning_rate": 9.999535936715239e-05,
"loss": 0.0035,
"step": 320
},
{
"epoch": 2.5991902834008096,
"grad_norm": 0.014235563576221466,
"learning_rate": 9.999455371635402e-05,
"loss": 0.0022,
"step": 321
},
{
"epoch": 2.6072874493927127,
"grad_norm": 0.013800045475363731,
"learning_rate": 9.999368361835226e-05,
"loss": 0.0034,
"step": 322
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.010226680897176266,
"learning_rate": 9.999274907426876e-05,
"loss": 0.0015,
"step": 323
},
{
"epoch": 2.623481781376518,
"grad_norm": 0.0106669832020998,
"learning_rate": 9.99917500853082e-05,
"loss": 0.0025,
"step": 324
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.007497102487832308,
"learning_rate": 9.999068665275834e-05,
"loss": 0.0012,
"step": 325
},
{
"epoch": 2.6315789473684212,
"eval_loss": 0.002751573920249939,
"eval_runtime": 20.8798,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 1.197,
"step": 325
},
{
"epoch": 2.639676113360324,
"grad_norm": 0.015462521463632584,
"learning_rate": 9.99895587779901e-05,
"loss": 0.0026,
"step": 326
},
{
"epoch": 2.6477732793522266,
"grad_norm": 0.008158071897923946,
"learning_rate": 9.998836646245735e-05,
"loss": 0.0011,
"step": 327
},
{
"epoch": 2.6558704453441297,
"grad_norm": 0.010231217369437218,
"learning_rate": 9.998710970769711e-05,
"loss": 0.0025,
"step": 328
},
{
"epoch": 2.6639676113360324,
"grad_norm": 0.011228160932660103,
"learning_rate": 9.998578851532945e-05,
"loss": 0.0022,
"step": 329
},
{
"epoch": 2.672064777327935,
"grad_norm": 0.010875989682972431,
"learning_rate": 9.998440288705747e-05,
"loss": 0.0028,
"step": 330
},
{
"epoch": 2.6801619433198383,
"grad_norm": 0.009110379964113235,
"learning_rate": 9.998295282466738e-05,
"loss": 0.0015,
"step": 331
},
{
"epoch": 2.688259109311741,
"grad_norm": 0.009458299726247787,
"learning_rate": 9.998143833002845e-05,
"loss": 0.0016,
"step": 332
},
{
"epoch": 2.6963562753036436,
"grad_norm": 0.00835677981376648,
"learning_rate": 9.997985940509295e-05,
"loss": 0.0013,
"step": 333
},
{
"epoch": 2.7044534412955468,
"grad_norm": 0.009965039789676666,
"learning_rate": 9.997821605189627e-05,
"loss": 0.0022,
"step": 334
},
{
"epoch": 2.7125506072874495,
"grad_norm": 0.010593763552606106,
"learning_rate": 9.997650827255685e-05,
"loss": 0.0015,
"step": 335
},
{
"epoch": 2.720647773279352,
"grad_norm": 0.010097038000822067,
"learning_rate": 9.997473606927612e-05,
"loss": 0.0015,
"step": 336
},
{
"epoch": 2.7287449392712553,
"grad_norm": 0.010393706150352955,
"learning_rate": 9.997289944433864e-05,
"loss": 0.0022,
"step": 337
},
{
"epoch": 2.736842105263158,
"grad_norm": 0.01462769228965044,
"learning_rate": 9.997099840011195e-05,
"loss": 0.0039,
"step": 338
},
{
"epoch": 2.7449392712550607,
"grad_norm": 0.010113600641489029,
"learning_rate": 9.996903293904666e-05,
"loss": 0.0018,
"step": 339
},
{
"epoch": 2.753036437246964,
"grad_norm": 0.011077907867729664,
"learning_rate": 9.996700306367643e-05,
"loss": 0.0009,
"step": 340
},
{
"epoch": 2.7611336032388665,
"grad_norm": 0.00902781542390585,
"learning_rate": 9.996490877661793e-05,
"loss": 0.0016,
"step": 341
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.014123060740530491,
"learning_rate": 9.996275008057087e-05,
"loss": 0.0027,
"step": 342
},
{
"epoch": 2.7773279352226723,
"grad_norm": 0.014702217653393745,
"learning_rate": 9.9960526978318e-05,
"loss": 0.0026,
"step": 343
},
{
"epoch": 2.785425101214575,
"grad_norm": 0.007217355538159609,
"learning_rate": 9.995823947272506e-05,
"loss": 0.0009,
"step": 344
},
{
"epoch": 2.7935222672064777,
"grad_norm": 0.013469511643052101,
"learning_rate": 9.995588756674088e-05,
"loss": 0.0027,
"step": 345
},
{
"epoch": 2.801619433198381,
"grad_norm": 0.012271186336874962,
"learning_rate": 9.995347126339725e-05,
"loss": 0.0013,
"step": 346
},
{
"epoch": 2.8097165991902835,
"grad_norm": 0.012494235299527645,
"learning_rate": 9.995099056580896e-05,
"loss": 0.0018,
"step": 347
},
{
"epoch": 2.817813765182186,
"grad_norm": 0.008622893132269382,
"learning_rate": 9.994844547717388e-05,
"loss": 0.0017,
"step": 348
},
{
"epoch": 2.8259109311740893,
"grad_norm": 0.011038469150662422,
"learning_rate": 9.994583600077283e-05,
"loss": 0.0017,
"step": 349
},
{
"epoch": 2.834008097165992,
"grad_norm": 0.010193824768066406,
"learning_rate": 9.994316213996964e-05,
"loss": 0.002,
"step": 350
},
{
"epoch": 2.834008097165992,
"eval_loss": 0.0024985964410007,
"eval_runtime": 20.8778,
"eval_samples_per_second": 4.79,
"eval_steps_per_second": 1.197,
"step": 350
},
{
"epoch": 2.8421052631578947,
"grad_norm": 0.008073719218373299,
"learning_rate": 9.994042389821114e-05,
"loss": 0.0013,
"step": 351
},
{
"epoch": 2.850202429149798,
"grad_norm": 0.007987983524799347,
"learning_rate": 9.993762127902717e-05,
"loss": 0.0012,
"step": 352
},
{
"epoch": 2.8582995951417005,
"grad_norm": 0.008735005743801594,
"learning_rate": 9.993475428603052e-05,
"loss": 0.0013,
"step": 353
},
{
"epoch": 2.866396761133603,
"grad_norm": 0.009095696732401848,
"learning_rate": 9.9931822922917e-05,
"loss": 0.0016,
"step": 354
},
{
"epoch": 2.8744939271255063,
"grad_norm": 0.011008110828697681,
"learning_rate": 9.992882719346539e-05,
"loss": 0.0021,
"step": 355
},
{
"epoch": 2.882591093117409,
"grad_norm": 0.009820708073675632,
"learning_rate": 9.992576710153743e-05,
"loss": 0.0024,
"step": 356
},
{
"epoch": 2.8906882591093117,
"grad_norm": 0.013662457466125488,
"learning_rate": 9.992264265107784e-05,
"loss": 0.0013,
"step": 357
},
{
"epoch": 2.898785425101215,
"grad_norm": 0.009639346040785313,
"learning_rate": 9.991945384611431e-05,
"loss": 0.0018,
"step": 358
},
{
"epoch": 2.9068825910931175,
"grad_norm": 0.008260666392743587,
"learning_rate": 9.991620069075745e-05,
"loss": 0.0011,
"step": 359
},
{
"epoch": 2.91497975708502,
"grad_norm": 0.015122090466320515,
"learning_rate": 9.991288318920089e-05,
"loss": 0.0012,
"step": 360
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.011863719671964645,
"learning_rate": 9.990950134572113e-05,
"loss": 0.0015,
"step": 361
},
{
"epoch": 2.931174089068826,
"grad_norm": 0.013348613865673542,
"learning_rate": 9.990605516467769e-05,
"loss": 0.0022,
"step": 362
},
{
"epoch": 2.9392712550607287,
"grad_norm": 0.01165574137121439,
"learning_rate": 9.990254465051297e-05,
"loss": 0.0023,
"step": 363
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.010028968565165997,
"learning_rate": 9.98989698077523e-05,
"loss": 0.0022,
"step": 364
},
{
"epoch": 2.9554655870445345,
"grad_norm": 0.008236058987677097,
"learning_rate": 9.9895330641004e-05,
"loss": 0.0012,
"step": 365
},
{
"epoch": 2.9635627530364372,
"grad_norm": 0.012825064361095428,
"learning_rate": 9.989162715495923e-05,
"loss": 0.0021,
"step": 366
},
{
"epoch": 2.97165991902834,
"grad_norm": 0.014107435010373592,
"learning_rate": 9.98878593543921e-05,
"loss": 0.0036,
"step": 367
},
{
"epoch": 2.979757085020243,
"grad_norm": 0.014563079923391342,
"learning_rate": 9.988402724415964e-05,
"loss": 0.0025,
"step": 368
},
{
"epoch": 2.9878542510121457,
"grad_norm": 0.011997046880424023,
"learning_rate": 9.988013082920173e-05,
"loss": 0.0035,
"step": 369
},
{
"epoch": 2.9959514170040484,
"grad_norm": 0.006900448817759752,
"learning_rate": 9.987617011454122e-05,
"loss": 0.0013,
"step": 370
},
{
"epoch": 3.0040485829959516,
"grad_norm": 0.015069114975631237,
"learning_rate": 9.987214510528378e-05,
"loss": 0.0025,
"step": 371
},
{
"epoch": 3.0121457489878543,
"grad_norm": 0.007571605499833822,
"learning_rate": 9.9868055806618e-05,
"loss": 0.0015,
"step": 372
},
{
"epoch": 3.020242914979757,
"grad_norm": 0.008344545029103756,
"learning_rate": 9.98639022238153e-05,
"loss": 0.0017,
"step": 373
},
{
"epoch": 3.02834008097166,
"grad_norm": 0.011865397915244102,
"learning_rate": 9.985968436223005e-05,
"loss": 0.0021,
"step": 374
},
{
"epoch": 3.0364372469635628,
"grad_norm": 0.006094765849411488,
"learning_rate": 9.985540222729939e-05,
"loss": 0.0008,
"step": 375
},
{
"epoch": 3.0364372469635628,
"eval_loss": 0.00258046155795455,
"eval_runtime": 20.9003,
"eval_samples_per_second": 4.785,
"eval_steps_per_second": 1.196,
"step": 375
},
{
"epoch": 3.0445344129554655,
"grad_norm": 0.009989949874579906,
"learning_rate": 9.985105582454336e-05,
"loss": 0.0013,
"step": 376
},
{
"epoch": 3.0526315789473686,
"grad_norm": 0.010446115396916866,
"learning_rate": 9.984664515956486e-05,
"loss": 0.0014,
"step": 377
},
{
"epoch": 3.0607287449392713,
"grad_norm": 0.018476417288184166,
"learning_rate": 9.984217023804958e-05,
"loss": 0.0021,
"step": 378
},
{
"epoch": 3.068825910931174,
"grad_norm": 0.01866602897644043,
"learning_rate": 9.983763106576612e-05,
"loss": 0.0032,
"step": 379
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.014463922940194607,
"learning_rate": 9.983302764856579e-05,
"loss": 0.0011,
"step": 380
},
{
"epoch": 3.08502024291498,
"grad_norm": 0.010341525077819824,
"learning_rate": 9.982835999238285e-05,
"loss": 0.0012,
"step": 381
},
{
"epoch": 3.0931174089068825,
"grad_norm": 0.00995098240673542,
"learning_rate": 9.982362810323424e-05,
"loss": 0.0012,
"step": 382
},
{
"epoch": 3.1012145748987856,
"grad_norm": 0.009842706844210625,
"learning_rate": 9.981883198721981e-05,
"loss": 0.0008,
"step": 383
},
{
"epoch": 3.1093117408906883,
"grad_norm": 0.013185705058276653,
"learning_rate": 9.981397165052215e-05,
"loss": 0.0023,
"step": 384
},
{
"epoch": 3.117408906882591,
"grad_norm": 0.009965223260223866,
"learning_rate": 9.980904709940666e-05,
"loss": 0.0012,
"step": 385
},
{
"epoch": 3.125506072874494,
"grad_norm": 0.011264238506555557,
"learning_rate": 9.980405834022146e-05,
"loss": 0.0018,
"step": 386
},
{
"epoch": 3.133603238866397,
"grad_norm": 0.006440012715756893,
"learning_rate": 9.97990053793975e-05,
"loss": 0.0007,
"step": 387
},
{
"epoch": 3.1417004048582995,
"grad_norm": 0.016382023692131042,
"learning_rate": 9.979388822344848e-05,
"loss": 0.0014,
"step": 388
},
{
"epoch": 3.1497975708502026,
"grad_norm": 0.008492662571370602,
"learning_rate": 9.978870687897086e-05,
"loss": 0.0012,
"step": 389
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.008105689659714699,
"learning_rate": 9.978346135264381e-05,
"loss": 0.0016,
"step": 390
},
{
"epoch": 3.165991902834008,
"grad_norm": 0.007508544716984034,
"learning_rate": 9.977815165122926e-05,
"loss": 0.001,
"step": 391
},
{
"epoch": 3.174089068825911,
"grad_norm": 0.00950626190751791,
"learning_rate": 9.977277778157186e-05,
"loss": 0.002,
"step": 392
},
{
"epoch": 3.182186234817814,
"grad_norm": 0.009004231542348862,
"learning_rate": 9.976733975059899e-05,
"loss": 0.0015,
"step": 393
},
{
"epoch": 3.1902834008097165,
"grad_norm": 0.0037491165567189455,
"learning_rate": 9.976183756532072e-05,
"loss": 0.0004,
"step": 394
},
{
"epoch": 3.1983805668016196,
"grad_norm": 0.01322921272367239,
"learning_rate": 9.975627123282985e-05,
"loss": 0.0017,
"step": 395
},
{
"epoch": 3.2064777327935223,
"grad_norm": 0.006518376059830189,
"learning_rate": 9.975064076030184e-05,
"loss": 0.0008,
"step": 396
},
{
"epoch": 3.214574898785425,
"grad_norm": 0.011334234848618507,
"learning_rate": 9.974494615499487e-05,
"loss": 0.0017,
"step": 397
},
{
"epoch": 3.2226720647773277,
"grad_norm": 0.007272353395819664,
"learning_rate": 9.973918742424972e-05,
"loss": 0.001,
"step": 398
},
{
"epoch": 3.230769230769231,
"grad_norm": 0.009874061681330204,
"learning_rate": 9.973336457548992e-05,
"loss": 0.0015,
"step": 399
},
{
"epoch": 3.2388663967611335,
"grad_norm": 0.007885873317718506,
"learning_rate": 9.972747761622159e-05,
"loss": 0.0012,
"step": 400
},
{
"epoch": 3.2388663967611335,
"eval_loss": 0.0024508482310920954,
"eval_runtime": 20.9363,
"eval_samples_per_second": 4.776,
"eval_steps_per_second": 1.194,
"step": 400
},
{
"epoch": 3.246963562753036,
"grad_norm": 0.007821009494364262,
"learning_rate": 9.972152655403353e-05,
"loss": 0.001,
"step": 401
},
{
"epoch": 3.2550607287449393,
"grad_norm": 0.014811043627560139,
"learning_rate": 9.971551139659716e-05,
"loss": 0.0024,
"step": 402
},
{
"epoch": 3.263157894736842,
"grad_norm": 0.009398553520441055,
"learning_rate": 9.970943215166652e-05,
"loss": 0.0014,
"step": 403
},
{
"epoch": 3.2712550607287447,
"grad_norm": 0.007041712291538715,
"learning_rate": 9.970328882707829e-05,
"loss": 0.0012,
"step": 404
},
{
"epoch": 3.279352226720648,
"grad_norm": 0.0057219392620027065,
"learning_rate": 9.969708143075171e-05,
"loss": 0.0005,
"step": 405
},
{
"epoch": 3.2874493927125505,
"grad_norm": 0.01213089469820261,
"learning_rate": 9.969080997068865e-05,
"loss": 0.0018,
"step": 406
},
{
"epoch": 3.2955465587044532,
"grad_norm": 0.007616452407091856,
"learning_rate": 9.968447445497356e-05,
"loss": 0.0009,
"step": 407
},
{
"epoch": 3.3036437246963564,
"grad_norm": 0.006538788788020611,
"learning_rate": 9.967807489177344e-05,
"loss": 0.0011,
"step": 408
},
{
"epoch": 3.311740890688259,
"grad_norm": 0.01062245387583971,
"learning_rate": 9.967161128933788e-05,
"loss": 0.0014,
"step": 409
},
{
"epoch": 3.3198380566801617,
"grad_norm": 0.011872652918100357,
"learning_rate": 9.966508365599899e-05,
"loss": 0.0016,
"step": 410
},
{
"epoch": 3.327935222672065,
"grad_norm": 0.012371920980513096,
"learning_rate": 9.965849200017145e-05,
"loss": 0.0021,
"step": 411
},
{
"epoch": 3.3360323886639676,
"grad_norm": 0.009583608247339725,
"learning_rate": 9.965183633035249e-05,
"loss": 0.0014,
"step": 412
},
{
"epoch": 3.3441295546558703,
"grad_norm": 0.009689375758171082,
"learning_rate": 9.964511665512179e-05,
"loss": 0.0011,
"step": 413
},
{
"epoch": 3.3522267206477734,
"grad_norm": 0.007418768480420113,
"learning_rate": 9.963833298314159e-05,
"loss": 0.001,
"step": 414
},
{
"epoch": 3.360323886639676,
"grad_norm": 0.009091746993362904,
"learning_rate": 9.963148532315663e-05,
"loss": 0.0014,
"step": 415
},
{
"epoch": 3.3684210526315788,
"grad_norm": 0.007394388318061829,
"learning_rate": 9.962457368399409e-05,
"loss": 0.0012,
"step": 416
},
{
"epoch": 3.376518218623482,
"grad_norm": 0.011532511562108994,
"learning_rate": 9.96175980745637e-05,
"loss": 0.0008,
"step": 417
},
{
"epoch": 3.3846153846153846,
"grad_norm": 0.009239987470209599,
"learning_rate": 9.961055850385759e-05,
"loss": 0.0015,
"step": 418
},
{
"epoch": 3.3927125506072873,
"grad_norm": 0.009725586511194706,
"learning_rate": 9.960345498095036e-05,
"loss": 0.0019,
"step": 419
},
{
"epoch": 3.4008097165991904,
"grad_norm": 0.008276725187897682,
"learning_rate": 9.959628751499906e-05,
"loss": 0.001,
"step": 420
},
{
"epoch": 3.408906882591093,
"grad_norm": 0.007257894612848759,
"learning_rate": 9.958905611524313e-05,
"loss": 0.0008,
"step": 421
},
{
"epoch": 3.417004048582996,
"grad_norm": 0.007111871615052223,
"learning_rate": 9.95817607910045e-05,
"loss": 0.0007,
"step": 422
},
{
"epoch": 3.425101214574899,
"grad_norm": 0.008013184182345867,
"learning_rate": 9.957440155168743e-05,
"loss": 0.0005,
"step": 423
},
{
"epoch": 3.4331983805668016,
"grad_norm": 0.007757282350212336,
"learning_rate": 9.95669784067786e-05,
"loss": 0.0012,
"step": 424
},
{
"epoch": 3.4412955465587043,
"grad_norm": 0.01001273188740015,
"learning_rate": 9.955949136584709e-05,
"loss": 0.0013,
"step": 425
},
{
"epoch": 3.4412955465587043,
"eval_loss": 0.0021295032929629087,
"eval_runtime": 20.8824,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 1.197,
"step": 425
},
{
"epoch": 3.4493927125506074,
"grad_norm": 0.0063932668417692184,
"learning_rate": 9.95519404385443e-05,
"loss": 0.001,
"step": 426
},
{
"epoch": 3.45748987854251,
"grad_norm": 0.010445266962051392,
"learning_rate": 9.954432563460403e-05,
"loss": 0.0013,
"step": 427
},
{
"epoch": 3.465587044534413,
"grad_norm": 0.006435538176447153,
"learning_rate": 9.953664696384242e-05,
"loss": 0.0007,
"step": 428
},
{
"epoch": 3.473684210526316,
"grad_norm": 0.004963045008480549,
"learning_rate": 9.95289044361579e-05,
"loss": 0.0008,
"step": 429
},
{
"epoch": 3.4817813765182186,
"grad_norm": 0.02623671293258667,
"learning_rate": 9.952109806153125e-05,
"loss": 0.0009,
"step": 430
},
{
"epoch": 3.4898785425101213,
"grad_norm": 0.00943893101066351,
"learning_rate": 9.951322785002554e-05,
"loss": 0.0012,
"step": 431
},
{
"epoch": 3.4979757085020244,
"grad_norm": 0.0059456489980220795,
"learning_rate": 9.950529381178617e-05,
"loss": 0.0009,
"step": 432
},
{
"epoch": 3.506072874493927,
"grad_norm": 0.00898104626685381,
"learning_rate": 9.949729595704076e-05,
"loss": 0.0014,
"step": 433
},
{
"epoch": 3.51417004048583,
"grad_norm": 0.009014531970024109,
"learning_rate": 9.948923429609921e-05,
"loss": 0.0012,
"step": 434
},
{
"epoch": 3.522267206477733,
"grad_norm": 0.012250890955328941,
"learning_rate": 9.948110883935371e-05,
"loss": 0.0024,
"step": 435
},
{
"epoch": 3.5303643724696356,
"grad_norm": 0.009938407689332962,
"learning_rate": 9.947291959727863e-05,
"loss": 0.0012,
"step": 436
},
{
"epoch": 3.5384615384615383,
"grad_norm": 0.009354399517178535,
"learning_rate": 9.94646665804306e-05,
"loss": 0.0015,
"step": 437
},
{
"epoch": 3.5465587044534415,
"grad_norm": 0.008449352346360683,
"learning_rate": 9.94563497994485e-05,
"loss": 0.0016,
"step": 438
},
{
"epoch": 3.554655870445344,
"grad_norm": 0.006286781746894121,
"learning_rate": 9.944796926505331e-05,
"loss": 0.0009,
"step": 439
},
{
"epoch": 3.562753036437247,
"grad_norm": 0.00723978690803051,
"learning_rate": 9.943952498804827e-05,
"loss": 0.0012,
"step": 440
},
{
"epoch": 3.57085020242915,
"grad_norm": 0.00893012247979641,
"learning_rate": 9.943101697931875e-05,
"loss": 0.0016,
"step": 441
},
{
"epoch": 3.5789473684210527,
"grad_norm": 0.00894072838127613,
"learning_rate": 9.942244524983232e-05,
"loss": 0.0017,
"step": 442
},
{
"epoch": 3.5870445344129553,
"grad_norm": 0.005625961814075708,
"learning_rate": 9.941380981063864e-05,
"loss": 0.0008,
"step": 443
},
{
"epoch": 3.5951417004048585,
"grad_norm": 0.010527077130973339,
"learning_rate": 9.940511067286952e-05,
"loss": 0.0012,
"step": 444
},
{
"epoch": 3.603238866396761,
"grad_norm": 0.00621502660214901,
"learning_rate": 9.939634784773892e-05,
"loss": 0.0009,
"step": 445
},
{
"epoch": 3.611336032388664,
"grad_norm": 0.012210030108690262,
"learning_rate": 9.938752134654282e-05,
"loss": 0.0014,
"step": 446
},
{
"epoch": 3.619433198380567,
"grad_norm": 0.01024533063173294,
"learning_rate": 9.937863118065932e-05,
"loss": 0.0012,
"step": 447
},
{
"epoch": 3.6275303643724697,
"grad_norm": 0.006288249045610428,
"learning_rate": 9.936967736154864e-05,
"loss": 0.0008,
"step": 448
},
{
"epoch": 3.6356275303643724,
"grad_norm": 0.010536898858845234,
"learning_rate": 9.936065990075296e-05,
"loss": 0.0008,
"step": 449
},
{
"epoch": 3.6437246963562755,
"grad_norm": 0.006477988325059414,
"learning_rate": 9.935157880989658e-05,
"loss": 0.0008,
"step": 450
},
{
"epoch": 3.6437246963562755,
"eval_loss": 0.0018996578874066472,
"eval_runtime": 20.8987,
"eval_samples_per_second": 4.785,
"eval_steps_per_second": 1.196,
"step": 450
},
{
"epoch": 3.651821862348178,
"grad_norm": 0.009040674194693565,
"learning_rate": 9.93424341006858e-05,
"loss": 0.0018,
"step": 451
},
{
"epoch": 3.659919028340081,
"grad_norm": 0.010683619417250156,
"learning_rate": 9.93332257849089e-05,
"loss": 0.0012,
"step": 452
},
{
"epoch": 3.668016194331984,
"grad_norm": 0.010580274276435375,
"learning_rate": 9.932395387443618e-05,
"loss": 0.0012,
"step": 453
},
{
"epoch": 3.6761133603238867,
"grad_norm": 0.01016503106802702,
"learning_rate": 9.931461838121993e-05,
"loss": 0.0013,
"step": 454
},
{
"epoch": 3.6842105263157894,
"grad_norm": 0.006146210245788097,
"learning_rate": 9.930521931729439e-05,
"loss": 0.0006,
"step": 455
},
{
"epoch": 3.6923076923076925,
"grad_norm": 0.0124620720744133,
"learning_rate": 9.929575669477572e-05,
"loss": 0.0015,
"step": 456
},
{
"epoch": 3.700404858299595,
"grad_norm": 0.011394270695745945,
"learning_rate": 9.928623052586207e-05,
"loss": 0.0012,
"step": 457
},
{
"epoch": 3.708502024291498,
"grad_norm": 0.005241707898676395,
"learning_rate": 9.927664082283345e-05,
"loss": 0.0005,
"step": 458
},
{
"epoch": 3.716599190283401,
"grad_norm": 0.00538614671677351,
"learning_rate": 9.926698759805184e-05,
"loss": 0.0006,
"step": 459
},
{
"epoch": 3.7246963562753037,
"grad_norm": 0.011280486360192299,
"learning_rate": 9.925727086396101e-05,
"loss": 0.0015,
"step": 460
},
{
"epoch": 3.7327935222672064,
"grad_norm": 0.006988388951867819,
"learning_rate": 9.924749063308668e-05,
"loss": 0.001,
"step": 461
},
{
"epoch": 3.7408906882591095,
"grad_norm": 0.011907723732292652,
"learning_rate": 9.923764691803639e-05,
"loss": 0.0021,
"step": 462
},
{
"epoch": 3.748987854251012,
"grad_norm": 0.012228334322571754,
"learning_rate": 9.922773973149953e-05,
"loss": 0.0011,
"step": 463
},
{
"epoch": 3.757085020242915,
"grad_norm": 0.00736306793987751,
"learning_rate": 9.921776908624727e-05,
"loss": 0.0013,
"step": 464
},
{
"epoch": 3.765182186234818,
"grad_norm": 0.007194120436906815,
"learning_rate": 9.920773499513266e-05,
"loss": 0.0009,
"step": 465
},
{
"epoch": 3.7732793522267207,
"grad_norm": 0.00893466267734766,
"learning_rate": 9.919763747109043e-05,
"loss": 0.0012,
"step": 466
},
{
"epoch": 3.7813765182186234,
"grad_norm": 0.009740286506712437,
"learning_rate": 9.91874765271372e-05,
"loss": 0.0019,
"step": 467
},
{
"epoch": 3.7894736842105265,
"grad_norm": 0.004781534895300865,
"learning_rate": 9.917725217637126e-05,
"loss": 0.0006,
"step": 468
},
{
"epoch": 3.7975708502024292,
"grad_norm": 0.011936492286622524,
"learning_rate": 9.916696443197267e-05,
"loss": 0.0015,
"step": 469
},
{
"epoch": 3.805668016194332,
"grad_norm": 0.008463852107524872,
"learning_rate": 9.91566133072032e-05,
"loss": 0.0012,
"step": 470
},
{
"epoch": 3.813765182186235,
"grad_norm": 0.008224911987781525,
"learning_rate": 9.914619881540629e-05,
"loss": 0.0011,
"step": 471
},
{
"epoch": 3.8218623481781377,
"grad_norm": 0.00846084114164114,
"learning_rate": 9.913572097000716e-05,
"loss": 0.0011,
"step": 472
},
{
"epoch": 3.8299595141700404,
"grad_norm": 0.006916975602507591,
"learning_rate": 9.912517978451259e-05,
"loss": 0.001,
"step": 473
},
{
"epoch": 3.8380566801619436,
"grad_norm": 0.015935301780700684,
"learning_rate": 9.911457527251109e-05,
"loss": 0.0011,
"step": 474
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.009782219305634499,
"learning_rate": 9.910390744767275e-05,
"loss": 0.0014,
"step": 475
},
{
"epoch": 3.8461538461538463,
"eval_loss": 0.0019529929850250483,
"eval_runtime": 20.908,
"eval_samples_per_second": 4.783,
"eval_steps_per_second": 1.196,
"step": 475
},
{
"epoch": 3.854251012145749,
"grad_norm": 0.008936782367527485,
"learning_rate": 9.90931763237493e-05,
"loss": 0.0015,
"step": 476
},
{
"epoch": 3.862348178137652,
"grad_norm": 0.006487220525741577,
"learning_rate": 9.908238191457409e-05,
"loss": 0.0011,
"step": 477
},
{
"epoch": 3.8704453441295548,
"grad_norm": 0.011362340301275253,
"learning_rate": 9.907152423406199e-05,
"loss": 0.0019,
"step": 478
},
{
"epoch": 3.8785425101214575,
"grad_norm": 0.011046521365642548,
"learning_rate": 9.906060329620949e-05,
"loss": 0.0017,
"step": 479
},
{
"epoch": 3.8866396761133606,
"grad_norm": 0.005723021924495697,
"learning_rate": 9.904961911509459e-05,
"loss": 0.0007,
"step": 480
},
{
"epoch": 3.8947368421052633,
"grad_norm": 0.004767613019794226,
"learning_rate": 9.903857170487684e-05,
"loss": 0.0006,
"step": 481
},
{
"epoch": 3.902834008097166,
"grad_norm": 0.0070763761177659035,
"learning_rate": 9.902746107979728e-05,
"loss": 0.0008,
"step": 482
},
{
"epoch": 3.910931174089069,
"grad_norm": 0.011897820979356766,
"learning_rate": 9.901628725417843e-05,
"loss": 0.0012,
"step": 483
},
{
"epoch": 3.919028340080972,
"grad_norm": 0.006268302444368601,
"learning_rate": 9.900505024242431e-05,
"loss": 0.0007,
"step": 484
},
{
"epoch": 3.9271255060728745,
"grad_norm": 0.007625557016581297,
"learning_rate": 9.899375005902038e-05,
"loss": 0.0019,
"step": 485
},
{
"epoch": 3.9352226720647776,
"grad_norm": 0.007365250959992409,
"learning_rate": 9.898238671853352e-05,
"loss": 0.001,
"step": 486
},
{
"epoch": 3.9433198380566803,
"grad_norm": 0.007656946778297424,
"learning_rate": 9.897096023561205e-05,
"loss": 0.0009,
"step": 487
},
{
"epoch": 3.951417004048583,
"grad_norm": 0.006735661532729864,
"learning_rate": 9.895947062498566e-05,
"loss": 0.0006,
"step": 488
},
{
"epoch": 3.9595141700404857,
"grad_norm": 0.009474151767790318,
"learning_rate": 9.894791790146542e-05,
"loss": 0.0015,
"step": 489
},
{
"epoch": 3.967611336032389,
"grad_norm": 0.00841295812278986,
"learning_rate": 9.89363020799438e-05,
"loss": 0.0011,
"step": 490
},
{
"epoch": 3.9757085020242915,
"grad_norm": 0.008091527037322521,
"learning_rate": 9.892462317539455e-05,
"loss": 0.0007,
"step": 491
},
{
"epoch": 3.983805668016194,
"grad_norm": 0.01014798879623413,
"learning_rate": 9.891288120287276e-05,
"loss": 0.0011,
"step": 492
},
{
"epoch": 3.9919028340080973,
"grad_norm": 0.008167481981217861,
"learning_rate": 9.890107617751484e-05,
"loss": 0.0008,
"step": 493
},
{
"epoch": 4.0,
"grad_norm": 0.006659403908997774,
"learning_rate": 9.888920811453846e-05,
"loss": 0.0006,
"step": 494
},
{
"epoch": 4.008097165991903,
"grad_norm": 0.0071459268219769,
"learning_rate": 9.887727702924255e-05,
"loss": 0.0008,
"step": 495
},
{
"epoch": 4.016194331983805,
"grad_norm": 0.007468671537935734,
"learning_rate": 9.886528293700729e-05,
"loss": 0.0007,
"step": 496
},
{
"epoch": 4.0242914979757085,
"grad_norm": 0.0065126921981573105,
"learning_rate": 9.885322585329409e-05,
"loss": 0.0005,
"step": 497
},
{
"epoch": 4.032388663967612,
"grad_norm": 0.007633598055690527,
"learning_rate": 9.884110579364552e-05,
"loss": 0.0007,
"step": 498
},
{
"epoch": 4.040485829959514,
"grad_norm": 0.00944494642317295,
"learning_rate": 9.882892277368538e-05,
"loss": 0.0006,
"step": 499
},
{
"epoch": 4.048582995951417,
"grad_norm": 0.011854424141347408,
"learning_rate": 9.881667680911862e-05,
"loss": 0.0009,
"step": 500
},
{
"epoch": 4.048582995951417,
"eval_loss": 0.001951522775925696,
"eval_runtime": 20.8898,
"eval_samples_per_second": 4.787,
"eval_steps_per_second": 1.197,
"step": 500
},
{
"epoch": 4.05668016194332,
"grad_norm": 0.007982923649251461,
"learning_rate": 9.880436791573133e-05,
"loss": 0.001,
"step": 501
},
{
"epoch": 4.064777327935222,
"grad_norm": 0.007158339489251375,
"learning_rate": 9.879199610939067e-05,
"loss": 0.0005,
"step": 502
},
{
"epoch": 4.0728744939271255,
"grad_norm": 0.006214539520442486,
"learning_rate": 9.877956140604498e-05,
"loss": 0.0005,
"step": 503
},
{
"epoch": 4.080971659919029,
"grad_norm": 0.00849990639835596,
"learning_rate": 9.876706382172365e-05,
"loss": 0.0007,
"step": 504
},
{
"epoch": 4.089068825910931,
"grad_norm": 0.00827990472316742,
"learning_rate": 9.87545033725371e-05,
"loss": 0.001,
"step": 505
},
{
"epoch": 4.097165991902834,
"grad_norm": 0.010654132813215256,
"learning_rate": 9.874188007467681e-05,
"loss": 0.0013,
"step": 506
},
{
"epoch": 4.105263157894737,
"grad_norm": 0.00579674681648612,
"learning_rate": 9.872919394441529e-05,
"loss": 0.0005,
"step": 507
},
{
"epoch": 4.113360323886639,
"grad_norm": 0.006263192277401686,
"learning_rate": 9.871644499810601e-05,
"loss": 0.0006,
"step": 508
},
{
"epoch": 4.1214574898785425,
"grad_norm": 0.008381963707506657,
"learning_rate": 9.870363325218349e-05,
"loss": 0.0012,
"step": 509
},
{
"epoch": 4.129554655870446,
"grad_norm": 0.011459080502390862,
"learning_rate": 9.86907587231631e-05,
"loss": 0.0012,
"step": 510
},
{
"epoch": 4.137651821862348,
"grad_norm": 0.004725282080471516,
"learning_rate": 9.867782142764122e-05,
"loss": 0.0006,
"step": 511
},
{
"epoch": 4.145748987854251,
"grad_norm": 0.007089782506227493,
"learning_rate": 9.866482138229511e-05,
"loss": 0.0008,
"step": 512
},
{
"epoch": 4.153846153846154,
"grad_norm": 0.005252163391560316,
"learning_rate": 9.865175860388293e-05,
"loss": 0.0006,
"step": 513
},
{
"epoch": 4.161943319838056,
"grad_norm": 0.006663177162408829,
"learning_rate": 9.86386331092437e-05,
"loss": 0.0007,
"step": 514
},
{
"epoch": 4.17004048582996,
"grad_norm": 0.007172934245318174,
"learning_rate": 9.86254449152973e-05,
"loss": 0.0006,
"step": 515
},
{
"epoch": 4.178137651821863,
"grad_norm": 0.006407279521226883,
"learning_rate": 9.861219403904442e-05,
"loss": 0.0005,
"step": 516
},
{
"epoch": 4.186234817813765,
"grad_norm": 0.007719367276877165,
"learning_rate": 9.859888049756656e-05,
"loss": 0.0008,
"step": 517
},
{
"epoch": 4.194331983805668,
"grad_norm": 0.008337226696312428,
"learning_rate": 9.8585504308026e-05,
"loss": 0.0007,
"step": 518
},
{
"epoch": 4.202429149797571,
"grad_norm": 0.008549238555133343,
"learning_rate": 9.857206548766576e-05,
"loss": 0.0005,
"step": 519
},
{
"epoch": 4.2105263157894735,
"grad_norm": 0.010102051310241222,
"learning_rate": 9.855856405380966e-05,
"loss": 0.0009,
"step": 520
},
{
"epoch": 4.218623481781377,
"grad_norm": 0.005718359258025885,
"learning_rate": 9.854500002386215e-05,
"loss": 0.0006,
"step": 521
},
{
"epoch": 4.22672064777328,
"grad_norm": 0.00954069197177887,
"learning_rate": 9.853137341530842e-05,
"loss": 0.0009,
"step": 522
},
{
"epoch": 4.234817813765182,
"grad_norm": 0.008051340468227863,
"learning_rate": 9.851768424571433e-05,
"loss": 0.0007,
"step": 523
},
{
"epoch": 4.242914979757085,
"grad_norm": 0.007244836073368788,
"learning_rate": 9.850393253272637e-05,
"loss": 0.0008,
"step": 524
},
{
"epoch": 4.251012145748988,
"grad_norm": 0.00602039834484458,
"learning_rate": 9.849011829407166e-05,
"loss": 0.0006,
"step": 525
},
{
"epoch": 4.251012145748988,
"eval_loss": 0.001924480078741908,
"eval_runtime": 20.9917,
"eval_samples_per_second": 4.764,
"eval_steps_per_second": 1.191,
"step": 525
},
{
"epoch": 4.2591093117408905,
"grad_norm": 0.007480318192392588,
"learning_rate": 9.84762415475579e-05,
"loss": 0.001,
"step": 526
},
{
"epoch": 4.267206477732794,
"grad_norm": 0.008988871239125729,
"learning_rate": 9.846230231107343e-05,
"loss": 0.0008,
"step": 527
},
{
"epoch": 4.275303643724697,
"grad_norm": 0.00547422282397747,
"learning_rate": 9.844830060258707e-05,
"loss": 0.0007,
"step": 528
},
{
"epoch": 4.283400809716599,
"grad_norm": 0.004437068942934275,
"learning_rate": 9.843423644014822e-05,
"loss": 0.0006,
"step": 529
},
{
"epoch": 4.291497975708502,
"grad_norm": 0.0038599406834691763,
"learning_rate": 9.842010984188676e-05,
"loss": 0.0004,
"step": 530
},
{
"epoch": 4.299595141700405,
"grad_norm": 0.0072926743887364864,
"learning_rate": 9.840592082601309e-05,
"loss": 0.0007,
"step": 531
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.006621995009481907,
"learning_rate": 9.839166941081804e-05,
"loss": 0.0011,
"step": 532
},
{
"epoch": 4.315789473684211,
"grad_norm": 0.008673852309584618,
"learning_rate": 9.837735561467288e-05,
"loss": 0.0012,
"step": 533
},
{
"epoch": 4.323886639676114,
"grad_norm": 0.004889126401394606,
"learning_rate": 9.836297945602931e-05,
"loss": 0.0005,
"step": 534
},
{
"epoch": 4.331983805668016,
"grad_norm": 0.010923854075372219,
"learning_rate": 9.83485409534194e-05,
"loss": 0.0013,
"step": 535
},
{
"epoch": 4.340080971659919,
"grad_norm": 0.013469451107084751,
"learning_rate": 9.833404012545562e-05,
"loss": 0.001,
"step": 536
},
{
"epoch": 4.348178137651822,
"grad_norm": 0.004695568699389696,
"learning_rate": 9.831947699083076e-05,
"loss": 0.0004,
"step": 537
},
{
"epoch": 4.3562753036437245,
"grad_norm": 0.005549240857362747,
"learning_rate": 9.830485156831792e-05,
"loss": 0.0004,
"step": 538
},
{
"epoch": 4.364372469635628,
"grad_norm": 0.004222598858177662,
"learning_rate": 9.829016387677051e-05,
"loss": 0.0004,
"step": 539
},
{
"epoch": 4.372469635627531,
"grad_norm": 0.0063895429484546185,
"learning_rate": 9.827541393512221e-05,
"loss": 0.0007,
"step": 540
},
{
"epoch": 4.380566801619433,
"grad_norm": 0.006316315848380327,
"learning_rate": 9.826060176238693e-05,
"loss": 0.0004,
"step": 541
},
{
"epoch": 4.388663967611336,
"grad_norm": 0.004954824224114418,
"learning_rate": 9.824572737765883e-05,
"loss": 0.0004,
"step": 542
},
{
"epoch": 4.396761133603239,
"grad_norm": 0.005867576692253351,
"learning_rate": 9.823079080011222e-05,
"loss": 0.0006,
"step": 543
},
{
"epoch": 4.4048582995951415,
"grad_norm": 0.004107177723199129,
"learning_rate": 9.821579204900164e-05,
"loss": 0.0003,
"step": 544
},
{
"epoch": 4.412955465587045,
"grad_norm": 0.008971895091235638,
"learning_rate": 9.820073114366173e-05,
"loss": 0.001,
"step": 545
},
{
"epoch": 4.421052631578947,
"grad_norm": 0.004289721138775349,
"learning_rate": 9.818560810350727e-05,
"loss": 0.0004,
"step": 546
},
{
"epoch": 4.42914979757085,
"grad_norm": 0.011725863441824913,
"learning_rate": 9.817042294803314e-05,
"loss": 0.0007,
"step": 547
},
{
"epoch": 4.437246963562753,
"grad_norm": 0.007441469002515078,
"learning_rate": 9.81551756968143e-05,
"loss": 0.0007,
"step": 548
},
{
"epoch": 4.445344129554655,
"grad_norm": 0.009847107343375683,
"learning_rate": 9.813986636950572e-05,
"loss": 0.0012,
"step": 549
},
{
"epoch": 4.4534412955465585,
"grad_norm": 0.008748643100261688,
"learning_rate": 9.812449498584245e-05,
"loss": 0.0005,
"step": 550
},
{
"epoch": 4.4534412955465585,
"eval_loss": 0.0018015182577073574,
"eval_runtime": 20.9424,
"eval_samples_per_second": 4.775,
"eval_steps_per_second": 1.194,
"step": 550
},
{
"epoch": 4.461538461538462,
"grad_norm": 0.007471165154129267,
"learning_rate": 9.810906156563946e-05,
"loss": 0.0006,
"step": 551
},
{
"epoch": 4.469635627530364,
"grad_norm": 0.008223461918532848,
"learning_rate": 9.809356612879175e-05,
"loss": 0.0008,
"step": 552
},
{
"epoch": 4.477732793522267,
"grad_norm": 0.006764471530914307,
"learning_rate": 9.807800869527426e-05,
"loss": 0.0005,
"step": 553
},
{
"epoch": 4.48582995951417,
"grad_norm": 0.01072748377919197,
"learning_rate": 9.806238928514184e-05,
"loss": 0.0012,
"step": 554
},
{
"epoch": 4.493927125506072,
"grad_norm": 0.011665924452245235,
"learning_rate": 9.80467079185292e-05,
"loss": 0.0006,
"step": 555
},
{
"epoch": 4.502024291497976,
"grad_norm": 0.012857378460466862,
"learning_rate": 9.803096461565098e-05,
"loss": 0.0005,
"step": 556
},
{
"epoch": 4.510121457489879,
"grad_norm": 0.007671073079109192,
"learning_rate": 9.801515939680159e-05,
"loss": 0.0006,
"step": 557
},
{
"epoch": 4.518218623481781,
"grad_norm": 0.008667264133691788,
"learning_rate": 9.799929228235532e-05,
"loss": 0.0009,
"step": 558
},
{
"epoch": 4.526315789473684,
"grad_norm": 0.009722660295665264,
"learning_rate": 9.798336329276623e-05,
"loss": 0.0009,
"step": 559
},
{
"epoch": 4.534412955465587,
"grad_norm": 0.00602738605812192,
"learning_rate": 9.796737244856811e-05,
"loss": 0.0006,
"step": 560
},
{
"epoch": 4.5425101214574894,
"grad_norm": 0.006649456452578306,
"learning_rate": 9.795131977037451e-05,
"loss": 0.0006,
"step": 561
},
{
"epoch": 4.550607287449393,
"grad_norm": 0.009024589322507381,
"learning_rate": 9.79352052788787e-05,
"loss": 0.0006,
"step": 562
},
{
"epoch": 4.558704453441296,
"grad_norm": 0.0069068074226379395,
"learning_rate": 9.79190289948536e-05,
"loss": 0.0009,
"step": 563
},
{
"epoch": 4.566801619433198,
"grad_norm": 0.007059336174279451,
"learning_rate": 9.790279093915183e-05,
"loss": 0.0008,
"step": 564
},
{
"epoch": 4.574898785425101,
"grad_norm": 0.005149452481418848,
"learning_rate": 9.788649113270562e-05,
"loss": 0.0005,
"step": 565
},
{
"epoch": 4.582995951417004,
"grad_norm": 0.008085817098617554,
"learning_rate": 9.787012959652677e-05,
"loss": 0.0009,
"step": 566
},
{
"epoch": 4.5910931174089065,
"grad_norm": 0.005893132649362087,
"learning_rate": 9.785370635170671e-05,
"loss": 0.0008,
"step": 567
},
{
"epoch": 4.59919028340081,
"grad_norm": 0.006068812217563391,
"learning_rate": 9.783722141941636e-05,
"loss": 0.0007,
"step": 568
},
{
"epoch": 4.607287449392713,
"grad_norm": 0.00651150569319725,
"learning_rate": 9.782067482090624e-05,
"loss": 0.0006,
"step": 569
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.008106587454676628,
"learning_rate": 9.780406657750626e-05,
"loss": 0.001,
"step": 570
},
{
"epoch": 4.623481781376518,
"grad_norm": 0.00984260905534029,
"learning_rate": 9.778739671062586e-05,
"loss": 0.0007,
"step": 571
},
{
"epoch": 4.631578947368421,
"grad_norm": 0.011000724509358406,
"learning_rate": 9.777066524175394e-05,
"loss": 0.001,
"step": 572
},
{
"epoch": 4.6396761133603235,
"grad_norm": 0.00920114666223526,
"learning_rate": 9.775387219245876e-05,
"loss": 0.0012,
"step": 573
},
{
"epoch": 4.647773279352227,
"grad_norm": 0.008547363802790642,
"learning_rate": 9.773701758438796e-05,
"loss": 0.0008,
"step": 574
},
{
"epoch": 4.65587044534413,
"grad_norm": 0.011154532432556152,
"learning_rate": 9.772010143926856e-05,
"loss": 0.0007,
"step": 575
},
{
"epoch": 4.65587044534413,
"eval_loss": 0.0017410250147804618,
"eval_runtime": 20.8952,
"eval_samples_per_second": 4.786,
"eval_steps_per_second": 1.196,
"step": 575
},
{
"epoch": 4.663967611336032,
"grad_norm": 0.008925629779696465,
"learning_rate": 9.77031237789069e-05,
"loss": 0.0009,
"step": 576
},
{
"epoch": 4.672064777327935,
"grad_norm": 0.006519634742289782,
"learning_rate": 9.768608462518865e-05,
"loss": 0.0007,
"step": 577
},
{
"epoch": 4.680161943319838,
"grad_norm": 0.0046993098221719265,
"learning_rate": 9.766898400007869e-05,
"loss": 0.0004,
"step": 578
},
{
"epoch": 4.6882591093117405,
"grad_norm": 0.011550773866474628,
"learning_rate": 9.765182192562117e-05,
"loss": 0.0007,
"step": 579
},
{
"epoch": 4.696356275303644,
"grad_norm": 0.008509055711328983,
"learning_rate": 9.763459842393945e-05,
"loss": 0.0007,
"step": 580
},
{
"epoch": 4.704453441295547,
"grad_norm": 0.009534220211207867,
"learning_rate": 9.76173135172361e-05,
"loss": 0.0012,
"step": 581
},
{
"epoch": 4.712550607287449,
"grad_norm": 0.008667496033012867,
"learning_rate": 9.759996722779281e-05,
"loss": 0.0006,
"step": 582
},
{
"epoch": 4.720647773279352,
"grad_norm": 0.004869160708039999,
"learning_rate": 9.758255957797042e-05,
"loss": 0.0004,
"step": 583
},
{
"epoch": 4.728744939271255,
"grad_norm": 0.007150961086153984,
"learning_rate": 9.756509059020884e-05,
"loss": 0.0006,
"step": 584
},
{
"epoch": 4.7368421052631575,
"grad_norm": 0.008666688576340675,
"learning_rate": 9.75475602870271e-05,
"loss": 0.0008,
"step": 585
},
{
"epoch": 4.744939271255061,
"grad_norm": 0.006314212456345558,
"learning_rate": 9.752996869102322e-05,
"loss": 0.001,
"step": 586
},
{
"epoch": 4.753036437246964,
"grad_norm": 0.007860385812819004,
"learning_rate": 9.751231582487428e-05,
"loss": 0.0007,
"step": 587
},
{
"epoch": 4.761133603238866,
"grad_norm": 0.007355378940701485,
"learning_rate": 9.749460171133629e-05,
"loss": 0.001,
"step": 588
},
{
"epoch": 4.769230769230769,
"grad_norm": 0.008818419650197029,
"learning_rate": 9.747682637324425e-05,
"loss": 0.0007,
"step": 589
},
{
"epoch": 4.777327935222672,
"grad_norm": 0.008113800548017025,
"learning_rate": 9.745898983351204e-05,
"loss": 0.0008,
"step": 590
},
{
"epoch": 4.7854251012145745,
"grad_norm": 0.006416656076908112,
"learning_rate": 9.744109211513253e-05,
"loss": 0.0009,
"step": 591
},
{
"epoch": 4.793522267206478,
"grad_norm": 0.009136557579040527,
"learning_rate": 9.742313324117736e-05,
"loss": 0.0006,
"step": 592
},
{
"epoch": 4.801619433198381,
"grad_norm": 0.008266017772257328,
"learning_rate": 9.740511323479702e-05,
"loss": 0.0012,
"step": 593
},
{
"epoch": 4.809716599190283,
"grad_norm": 0.005936949979513884,
"learning_rate": 9.738703211922084e-05,
"loss": 0.0008,
"step": 594
},
{
"epoch": 4.817813765182186,
"grad_norm": 0.00802876427769661,
"learning_rate": 9.736888991775688e-05,
"loss": 0.0006,
"step": 595
},
{
"epoch": 4.825910931174089,
"grad_norm": 0.006083488930016756,
"learning_rate": 9.735068665379201e-05,
"loss": 0.0008,
"step": 596
},
{
"epoch": 4.834008097165992,
"grad_norm": 0.0064203874208033085,
"learning_rate": 9.733242235079175e-05,
"loss": 0.0008,
"step": 597
},
{
"epoch": 4.842105263157895,
"grad_norm": 0.0068838950246572495,
"learning_rate": 9.731409703230035e-05,
"loss": 0.0006,
"step": 598
},
{
"epoch": 4.850202429149798,
"grad_norm": 0.006466129794716835,
"learning_rate": 9.729571072194066e-05,
"loss": 0.0005,
"step": 599
},
{
"epoch": 4.8582995951417,
"grad_norm": 0.004359446931630373,
"learning_rate": 9.727726344341419e-05,
"loss": 0.0004,
"step": 600
},
{
"epoch": 4.8582995951417,
"eval_loss": 0.0015752798644825816,
"eval_runtime": 20.8835,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 600
},
{
"epoch": 4.866396761133603,
"grad_norm": 0.008679832331836224,
"learning_rate": 9.725875522050107e-05,
"loss": 0.0012,
"step": 601
},
{
"epoch": 4.874493927125506,
"grad_norm": 0.0073195514269173145,
"learning_rate": 9.724018607705995e-05,
"loss": 0.0013,
"step": 602
},
{
"epoch": 4.882591093117409,
"grad_norm": 0.00781306903809309,
"learning_rate": 9.722155603702804e-05,
"loss": 0.0008,
"step": 603
},
{
"epoch": 4.890688259109312,
"grad_norm": 0.009774848818778992,
"learning_rate": 9.7202865124421e-05,
"loss": 0.0006,
"step": 604
},
{
"epoch": 4.898785425101215,
"grad_norm": 0.006310292985290289,
"learning_rate": 9.718411336333301e-05,
"loss": 0.0006,
"step": 605
},
{
"epoch": 4.906882591093117,
"grad_norm": 0.00883979070931673,
"learning_rate": 9.71653007779367e-05,
"loss": 0.0014,
"step": 606
},
{
"epoch": 4.91497975708502,
"grad_norm": 0.008740267716348171,
"learning_rate": 9.714642739248305e-05,
"loss": 0.0008,
"step": 607
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.003794717602431774,
"learning_rate": 9.712749323130146e-05,
"loss": 0.0004,
"step": 608
},
{
"epoch": 4.931174089068826,
"grad_norm": 0.007664462551474571,
"learning_rate": 9.710849831879967e-05,
"loss": 0.0009,
"step": 609
},
{
"epoch": 4.939271255060729,
"grad_norm": 0.007098712492734194,
"learning_rate": 9.708944267946369e-05,
"loss": 0.0007,
"step": 610
},
{
"epoch": 4.947368421052632,
"grad_norm": 0.004916313569992781,
"learning_rate": 9.70703263378579e-05,
"loss": 0.0008,
"step": 611
},
{
"epoch": 4.955465587044534,
"grad_norm": 0.006048915442079306,
"learning_rate": 9.705114931862486e-05,
"loss": 0.0009,
"step": 612
},
{
"epoch": 4.963562753036437,
"grad_norm": 0.007562866434454918,
"learning_rate": 9.703191164648537e-05,
"loss": 0.0006,
"step": 613
},
{
"epoch": 4.97165991902834,
"grad_norm": 0.006720076780766249,
"learning_rate": 9.70126133462384e-05,
"loss": 0.0007,
"step": 614
},
{
"epoch": 4.979757085020243,
"grad_norm": 0.005906874779611826,
"learning_rate": 9.699325444276109e-05,
"loss": 0.0004,
"step": 615
},
{
"epoch": 4.987854251012146,
"grad_norm": 0.004341105464845896,
"learning_rate": 9.697383496100872e-05,
"loss": 0.0004,
"step": 616
},
{
"epoch": 4.995951417004049,
"grad_norm": 0.005335621070116758,
"learning_rate": 9.695435492601464e-05,
"loss": 0.0004,
"step": 617
},
{
"epoch": 5.004048582995951,
"grad_norm": 0.010734565556049347,
"learning_rate": 9.693481436289025e-05,
"loss": 0.0014,
"step": 618
},
{
"epoch": 5.012145748987854,
"grad_norm": 0.0031302126590162516,
"learning_rate": 9.691521329682499e-05,
"loss": 0.0003,
"step": 619
},
{
"epoch": 5.020242914979757,
"grad_norm": 0.006032292731106281,
"learning_rate": 9.68955517530863e-05,
"loss": 0.0004,
"step": 620
},
{
"epoch": 5.02834008097166,
"grad_norm": 0.005305845756083727,
"learning_rate": 9.687582975701956e-05,
"loss": 0.0005,
"step": 621
},
{
"epoch": 5.036437246963563,
"grad_norm": 0.008457905612885952,
"learning_rate": 9.685604733404808e-05,
"loss": 0.0004,
"step": 622
},
{
"epoch": 5.044534412955466,
"grad_norm": 0.00620026420801878,
"learning_rate": 9.68362045096731e-05,
"loss": 0.0004,
"step": 623
},
{
"epoch": 5.052631578947368,
"grad_norm": 0.008932286873459816,
"learning_rate": 9.681630130947367e-05,
"loss": 0.0003,
"step": 624
},
{
"epoch": 5.060728744939271,
"grad_norm": 0.007199987303465605,
"learning_rate": 9.679633775910672e-05,
"loss": 0.0004,
"step": 625
},
{
"epoch": 5.060728744939271,
"eval_loss": 0.0016215384239330888,
"eval_runtime": 20.8859,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 625
},
{
"epoch": 5.068825910931174,
"grad_norm": 0.0068572270683944225,
"learning_rate": 9.677631388430694e-05,
"loss": 0.0004,
"step": 626
},
{
"epoch": 5.076923076923077,
"grad_norm": 0.007633959408849478,
"learning_rate": 9.675622971088681e-05,
"loss": 0.0006,
"step": 627
},
{
"epoch": 5.08502024291498,
"grad_norm": 0.005319299641996622,
"learning_rate": 9.673608526473649e-05,
"loss": 0.0007,
"step": 628
},
{
"epoch": 5.093117408906883,
"grad_norm": 0.003859872929751873,
"learning_rate": 9.671588057182391e-05,
"loss": 0.0003,
"step": 629
},
{
"epoch": 5.101214574898785,
"grad_norm": 0.007842942140996456,
"learning_rate": 9.669561565819463e-05,
"loss": 0.0004,
"step": 630
},
{
"epoch": 5.109311740890688,
"grad_norm": 0.007960239425301552,
"learning_rate": 9.66752905499718e-05,
"loss": 0.0004,
"step": 631
},
{
"epoch": 5.117408906882591,
"grad_norm": 0.0059306202456355095,
"learning_rate": 9.665490527335622e-05,
"loss": 0.0004,
"step": 632
},
{
"epoch": 5.125506072874494,
"grad_norm": 0.006550755817443132,
"learning_rate": 9.663445985462624e-05,
"loss": 0.0006,
"step": 633
},
{
"epoch": 5.133603238866397,
"grad_norm": 0.009190104901790619,
"learning_rate": 9.661395432013773e-05,
"loss": 0.0006,
"step": 634
},
{
"epoch": 5.1417004048583,
"grad_norm": 0.0046551162376999855,
"learning_rate": 9.659338869632406e-05,
"loss": 0.0005,
"step": 635
},
{
"epoch": 5.149797570850202,
"grad_norm": 0.0077586546540260315,
"learning_rate": 9.657276300969604e-05,
"loss": 0.0005,
"step": 636
},
{
"epoch": 5.157894736842105,
"grad_norm": 0.009180366061627865,
"learning_rate": 9.655207728684194e-05,
"loss": 0.0007,
"step": 637
},
{
"epoch": 5.165991902834008,
"grad_norm": 0.003306223312392831,
"learning_rate": 9.65313315544274e-05,
"loss": 0.0003,
"step": 638
},
{
"epoch": 5.174089068825911,
"grad_norm": 0.0038234649691730738,
"learning_rate": 9.65105258391954e-05,
"loss": 0.0004,
"step": 639
},
{
"epoch": 5.182186234817814,
"grad_norm": 0.00610633147880435,
"learning_rate": 9.64896601679663e-05,
"loss": 0.0005,
"step": 640
},
{
"epoch": 5.190283400809717,
"grad_norm": 0.004961833823472261,
"learning_rate": 9.64687345676377e-05,
"loss": 0.0005,
"step": 641
},
{
"epoch": 5.198380566801619,
"grad_norm": 0.004935056436806917,
"learning_rate": 9.644774906518445e-05,
"loss": 0.0006,
"step": 642
},
{
"epoch": 5.206477732793522,
"grad_norm": 0.005315741058439016,
"learning_rate": 9.642670368765865e-05,
"loss": 0.0006,
"step": 643
},
{
"epoch": 5.2145748987854255,
"grad_norm": 0.0038190498016774654,
"learning_rate": 9.640559846218958e-05,
"loss": 0.0004,
"step": 644
},
{
"epoch": 5.222672064777328,
"grad_norm": 0.005875582341104746,
"learning_rate": 9.638443341598364e-05,
"loss": 0.0004,
"step": 645
},
{
"epoch": 5.230769230769231,
"grad_norm": 0.006707495544105768,
"learning_rate": 9.636320857632437e-05,
"loss": 0.0005,
"step": 646
},
{
"epoch": 5.238866396761134,
"grad_norm": 0.0063120415434241295,
"learning_rate": 9.634192397057238e-05,
"loss": 0.0005,
"step": 647
},
{
"epoch": 5.246963562753036,
"grad_norm": 0.009964760392904282,
"learning_rate": 9.632057962616531e-05,
"loss": 0.0008,
"step": 648
},
{
"epoch": 5.255060728744939,
"grad_norm": 0.005848821718245745,
"learning_rate": 9.629917557061787e-05,
"loss": 0.0005,
"step": 649
},
{
"epoch": 5.2631578947368425,
"grad_norm": 0.004280323162674904,
"learning_rate": 9.627771183152164e-05,
"loss": 0.0004,
"step": 650
},
{
"epoch": 5.2631578947368425,
"eval_loss": 0.0015275988262146711,
"eval_runtime": 20.8893,
"eval_samples_per_second": 4.787,
"eval_steps_per_second": 1.197,
"step": 650
},
{
"epoch": 5.271255060728745,
"grad_norm": 0.004903740715235472,
"learning_rate": 9.625618843654523e-05,
"loss": 0.0005,
"step": 651
},
{
"epoch": 5.279352226720648,
"grad_norm": 0.005046170204877853,
"learning_rate": 9.62346054134341e-05,
"loss": 0.0005,
"step": 652
},
{
"epoch": 5.287449392712551,
"grad_norm": 0.005617137067019939,
"learning_rate": 9.621296279001059e-05,
"loss": 0.0004,
"step": 653
},
{
"epoch": 5.295546558704453,
"grad_norm": 0.004350410774350166,
"learning_rate": 9.619126059417387e-05,
"loss": 0.0004,
"step": 654
},
{
"epoch": 5.303643724696356,
"grad_norm": 0.005870455875992775,
"learning_rate": 9.616949885389991e-05,
"loss": 0.0005,
"step": 655
},
{
"epoch": 5.3117408906882595,
"grad_norm": 0.009995516389608383,
"learning_rate": 9.614767759724143e-05,
"loss": 0.0007,
"step": 656
},
{
"epoch": 5.319838056680162,
"grad_norm": 0.002616090467199683,
"learning_rate": 9.612579685232788e-05,
"loss": 0.0003,
"step": 657
},
{
"epoch": 5.327935222672065,
"grad_norm": 0.005312880035489798,
"learning_rate": 9.610385664736536e-05,
"loss": 0.0005,
"step": 658
},
{
"epoch": 5.336032388663968,
"grad_norm": 0.006149233318865299,
"learning_rate": 9.60818570106367e-05,
"loss": 0.0004,
"step": 659
},
{
"epoch": 5.34412955465587,
"grad_norm": 0.005688710603863001,
"learning_rate": 9.605979797050124e-05,
"loss": 0.0006,
"step": 660
},
{
"epoch": 5.352226720647773,
"grad_norm": 0.00494812149554491,
"learning_rate": 9.603767955539495e-05,
"loss": 0.0004,
"step": 661
},
{
"epoch": 5.3603238866396765,
"grad_norm": 0.004429470282047987,
"learning_rate": 9.601550179383036e-05,
"loss": 0.0006,
"step": 662
},
{
"epoch": 5.368421052631579,
"grad_norm": 0.006522475741803646,
"learning_rate": 9.599326471439647e-05,
"loss": 0.0005,
"step": 663
},
{
"epoch": 5.376518218623482,
"grad_norm": 0.005521293263882399,
"learning_rate": 9.597096834575877e-05,
"loss": 0.0005,
"step": 664
},
{
"epoch": 5.384615384615385,
"grad_norm": 0.0048112692311406136,
"learning_rate": 9.594861271665912e-05,
"loss": 0.0004,
"step": 665
},
{
"epoch": 5.392712550607287,
"grad_norm": 0.007786073721945286,
"learning_rate": 9.592619785591586e-05,
"loss": 0.0005,
"step": 666
},
{
"epoch": 5.40080971659919,
"grad_norm": 0.007375451736152172,
"learning_rate": 9.59037237924236e-05,
"loss": 0.0005,
"step": 667
},
{
"epoch": 5.4089068825910935,
"grad_norm": 0.008655287325382233,
"learning_rate": 9.588119055515333e-05,
"loss": 0.0005,
"step": 668
},
{
"epoch": 5.417004048582996,
"grad_norm": 0.002889828523620963,
"learning_rate": 9.58585981731523e-05,
"loss": 0.0003,
"step": 669
},
{
"epoch": 5.425101214574899,
"grad_norm": 0.00909927673637867,
"learning_rate": 9.583594667554399e-05,
"loss": 0.0006,
"step": 670
},
{
"epoch": 5.433198380566802,
"grad_norm": 0.003937150351703167,
"learning_rate": 9.581323609152808e-05,
"loss": 0.0003,
"step": 671
},
{
"epoch": 5.441295546558704,
"grad_norm": 0.010027210228145123,
"learning_rate": 9.579046645038047e-05,
"loss": 0.0009,
"step": 672
},
{
"epoch": 5.449392712550607,
"grad_norm": 0.006260544527322054,
"learning_rate": 9.576763778145312e-05,
"loss": 0.0005,
"step": 673
},
{
"epoch": 5.4574898785425106,
"grad_norm": 0.0034802183508872986,
"learning_rate": 9.574475011417411e-05,
"loss": 0.0004,
"step": 674
},
{
"epoch": 5.465587044534413,
"grad_norm": 0.004040226805955172,
"learning_rate": 9.57218034780476e-05,
"loss": 0.0004,
"step": 675
},
{
"epoch": 5.465587044534413,
"eval_loss": 0.0016189313028007746,
"eval_runtime": 20.8859,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 675
},
{
"epoch": 5.473684210526316,
"grad_norm": 0.0035269635263830423,
"learning_rate": 9.569879790265373e-05,
"loss": 0.0004,
"step": 676
},
{
"epoch": 5.481781376518219,
"grad_norm": 0.005436853971332312,
"learning_rate": 9.567573341764862e-05,
"loss": 0.0003,
"step": 677
},
{
"epoch": 5.489878542510121,
"grad_norm": 0.007649141363799572,
"learning_rate": 9.565261005276435e-05,
"loss": 0.0007,
"step": 678
},
{
"epoch": 5.497975708502024,
"grad_norm": 0.006564602255821228,
"learning_rate": 9.562942783780891e-05,
"loss": 0.0006,
"step": 679
},
{
"epoch": 5.506072874493928,
"grad_norm": 0.005341153126209974,
"learning_rate": 9.560618680266609e-05,
"loss": 0.0003,
"step": 680
},
{
"epoch": 5.51417004048583,
"grad_norm": 0.005836586467921734,
"learning_rate": 9.558288697729559e-05,
"loss": 0.0006,
"step": 681
},
{
"epoch": 5.522267206477733,
"grad_norm": 0.009045450948178768,
"learning_rate": 9.555952839173282e-05,
"loss": 0.0008,
"step": 682
},
{
"epoch": 5.530364372469636,
"grad_norm": 0.0071211401373147964,
"learning_rate": 9.5536111076089e-05,
"loss": 0.0004,
"step": 683
},
{
"epoch": 5.538461538461538,
"grad_norm": 0.008516835980117321,
"learning_rate": 9.5512635060551e-05,
"loss": 0.0008,
"step": 684
},
{
"epoch": 5.5465587044534415,
"grad_norm": 0.0046457210555672646,
"learning_rate": 9.548910037538141e-05,
"loss": 0.0003,
"step": 685
},
{
"epoch": 5.554655870445345,
"grad_norm": 0.006752189248800278,
"learning_rate": 9.546550705091842e-05,
"loss": 0.0005,
"step": 686
},
{
"epoch": 5.562753036437247,
"grad_norm": 0.002872879384085536,
"learning_rate": 9.544185511757581e-05,
"loss": 0.0003,
"step": 687
},
{
"epoch": 5.57085020242915,
"grad_norm": 0.009211295284330845,
"learning_rate": 9.541814460584293e-05,
"loss": 0.0005,
"step": 688
},
{
"epoch": 5.578947368421053,
"grad_norm": 0.0036671042907983065,
"learning_rate": 9.539437554628464e-05,
"loss": 0.0004,
"step": 689
},
{
"epoch": 5.587044534412955,
"grad_norm": 0.005291562993079424,
"learning_rate": 9.537054796954123e-05,
"loss": 0.0003,
"step": 690
},
{
"epoch": 5.5951417004048585,
"grad_norm": 0.0069409445859491825,
"learning_rate": 9.53466619063285e-05,
"loss": 0.0006,
"step": 691
},
{
"epoch": 5.603238866396762,
"grad_norm": 0.004893122706562281,
"learning_rate": 9.53227173874376e-05,
"loss": 0.0003,
"step": 692
},
{
"epoch": 5.611336032388664,
"grad_norm": 0.0035761839244514704,
"learning_rate": 9.529871444373502e-05,
"loss": 0.0004,
"step": 693
},
{
"epoch": 5.619433198380567,
"grad_norm": 0.005408015567809343,
"learning_rate": 9.527465310616259e-05,
"loss": 0.0004,
"step": 694
},
{
"epoch": 5.62753036437247,
"grad_norm": 0.007360650692135096,
"learning_rate": 9.52505334057374e-05,
"loss": 0.0008,
"step": 695
},
{
"epoch": 5.635627530364372,
"grad_norm": 0.004153635818511248,
"learning_rate": 9.522635537355178e-05,
"loss": 0.0004,
"step": 696
},
{
"epoch": 5.6437246963562755,
"grad_norm": 0.004713242873549461,
"learning_rate": 9.520211904077328e-05,
"loss": 0.0005,
"step": 697
},
{
"epoch": 5.651821862348179,
"grad_norm": 0.00541323609650135,
"learning_rate": 9.517782443864455e-05,
"loss": 0.0005,
"step": 698
},
{
"epoch": 5.659919028340081,
"grad_norm": 0.007091245148330927,
"learning_rate": 9.51534715984834e-05,
"loss": 0.0007,
"step": 699
},
{
"epoch": 5.668016194331984,
"grad_norm": 0.004219442140311003,
"learning_rate": 9.512906055168269e-05,
"loss": 0.0005,
"step": 700
},
{
"epoch": 5.668016194331984,
"eval_loss": 0.0014782178914174438,
"eval_runtime": 20.8959,
"eval_samples_per_second": 4.786,
"eval_steps_per_second": 1.196,
"step": 700
},
{
"epoch": 5.676113360323887,
"grad_norm": 0.003051872830837965,
"learning_rate": 9.510459132971035e-05,
"loss": 0.0004,
"step": 701
},
{
"epoch": 5.684210526315789,
"grad_norm": 0.0043883356265723705,
"learning_rate": 9.508006396410923e-05,
"loss": 0.0003,
"step": 702
},
{
"epoch": 5.6923076923076925,
"grad_norm": 0.0058163790963590145,
"learning_rate": 9.505547848649721e-05,
"loss": 0.0005,
"step": 703
},
{
"epoch": 5.700404858299595,
"grad_norm": 0.00913459062576294,
"learning_rate": 9.503083492856704e-05,
"loss": 0.0008,
"step": 704
},
{
"epoch": 5.708502024291498,
"grad_norm": 0.007107607554644346,
"learning_rate": 9.500613332208634e-05,
"loss": 0.0006,
"step": 705
},
{
"epoch": 5.716599190283401,
"grad_norm": 0.002083905041217804,
"learning_rate": 9.498137369889757e-05,
"loss": 0.0002,
"step": 706
},
{
"epoch": 5.724696356275303,
"grad_norm": 0.0058279503136873245,
"learning_rate": 9.495655609091799e-05,
"loss": 0.0005,
"step": 707
},
{
"epoch": 5.732793522267206,
"grad_norm": 0.002947121160104871,
"learning_rate": 9.493168053013957e-05,
"loss": 0.0003,
"step": 708
},
{
"epoch": 5.7408906882591095,
"grad_norm": 0.0052326153963804245,
"learning_rate": 9.490674704862901e-05,
"loss": 0.0003,
"step": 709
},
{
"epoch": 5.748987854251012,
"grad_norm": 0.0074744438752532005,
"learning_rate": 9.48817556785277e-05,
"loss": 0.0006,
"step": 710
},
{
"epoch": 5.757085020242915,
"grad_norm": 0.007500396575778723,
"learning_rate": 9.485670645205163e-05,
"loss": 0.0004,
"step": 711
},
{
"epoch": 5.765182186234818,
"grad_norm": 0.006873182021081448,
"learning_rate": 9.483159940149132e-05,
"loss": 0.0006,
"step": 712
},
{
"epoch": 5.77327935222672,
"grad_norm": 0.007117138244211674,
"learning_rate": 9.480643455921194e-05,
"loss": 0.0005,
"step": 713
},
{
"epoch": 5.781376518218623,
"grad_norm": 0.0046636732295155525,
"learning_rate": 9.478121195765303e-05,
"loss": 0.0004,
"step": 714
},
{
"epoch": 5.7894736842105265,
"grad_norm": 0.006344164256006479,
"learning_rate": 9.475593162932872e-05,
"loss": 0.0006,
"step": 715
},
{
"epoch": 5.797570850202429,
"grad_norm": 0.007509822491556406,
"learning_rate": 9.473059360682747e-05,
"loss": 0.0004,
"step": 716
},
{
"epoch": 5.805668016194332,
"grad_norm": 0.0036457066889852285,
"learning_rate": 9.47051979228121e-05,
"loss": 0.0004,
"step": 717
},
{
"epoch": 5.813765182186235,
"grad_norm": 0.0034911236725747585,
"learning_rate": 9.467974461001982e-05,
"loss": 0.0003,
"step": 718
},
{
"epoch": 5.821862348178137,
"grad_norm": 0.006103496067225933,
"learning_rate": 9.465423370126212e-05,
"loss": 0.0005,
"step": 719
},
{
"epoch": 5.82995951417004,
"grad_norm": 0.0062059275805950165,
"learning_rate": 9.462866522942468e-05,
"loss": 0.0006,
"step": 720
},
{
"epoch": 5.838056680161944,
"grad_norm": 0.003860118566080928,
"learning_rate": 9.460303922746743e-05,
"loss": 0.0003,
"step": 721
},
{
"epoch": 5.846153846153846,
"grad_norm": 0.008872183039784431,
"learning_rate": 9.457735572842445e-05,
"loss": 0.0005,
"step": 722
},
{
"epoch": 5.854251012145749,
"grad_norm": 0.008847801014780998,
"learning_rate": 9.455161476540394e-05,
"loss": 0.0005,
"step": 723
},
{
"epoch": 5.862348178137652,
"grad_norm": 0.0064693475142121315,
"learning_rate": 9.452581637158819e-05,
"loss": 0.0004,
"step": 724
},
{
"epoch": 5.870445344129554,
"grad_norm": 0.003565672319382429,
"learning_rate": 9.44999605802335e-05,
"loss": 0.0003,
"step": 725
},
{
"epoch": 5.870445344129554,
"eval_loss": 0.0015405402518808842,
"eval_runtime": 20.8981,
"eval_samples_per_second": 4.785,
"eval_steps_per_second": 1.196,
"step": 725
},
{
"epoch": 5.8785425101214575,
"grad_norm": 0.006012369878590107,
"learning_rate": 9.447404742467017e-05,
"loss": 0.0005,
"step": 726
},
{
"epoch": 5.886639676113361,
"grad_norm": 0.01144502218812704,
"learning_rate": 9.444807693830244e-05,
"loss": 0.0006,
"step": 727
},
{
"epoch": 5.894736842105263,
"grad_norm": 0.003242105944082141,
"learning_rate": 9.442204915460847e-05,
"loss": 0.0003,
"step": 728
},
{
"epoch": 5.902834008097166,
"grad_norm": 0.005479468032717705,
"learning_rate": 9.439596410714027e-05,
"loss": 0.0006,
"step": 729
},
{
"epoch": 5.910931174089069,
"grad_norm": 0.007425523828715086,
"learning_rate": 9.436982182952367e-05,
"loss": 0.0006,
"step": 730
},
{
"epoch": 5.919028340080971,
"grad_norm": 0.005463286302983761,
"learning_rate": 9.434362235545827e-05,
"loss": 0.0006,
"step": 731
},
{
"epoch": 5.9271255060728745,
"grad_norm": 0.0037547284737229347,
"learning_rate": 9.431736571871741e-05,
"loss": 0.0004,
"step": 732
},
{
"epoch": 5.935222672064778,
"grad_norm": 0.0055890390649437904,
"learning_rate": 9.429105195314812e-05,
"loss": 0.0003,
"step": 733
},
{
"epoch": 5.94331983805668,
"grad_norm": 0.0038566221483051777,
"learning_rate": 9.426468109267104e-05,
"loss": 0.0002,
"step": 734
},
{
"epoch": 5.951417004048583,
"grad_norm": 0.004965242929756641,
"learning_rate": 9.423825317128045e-05,
"loss": 0.0005,
"step": 735
},
{
"epoch": 5.959514170040486,
"grad_norm": 0.0034646911080926657,
"learning_rate": 9.42117682230442e-05,
"loss": 0.0004,
"step": 736
},
{
"epoch": 5.967611336032388,
"grad_norm": 0.010685238055884838,
"learning_rate": 9.41852262821036e-05,
"loss": 0.0006,
"step": 737
},
{
"epoch": 5.9757085020242915,
"grad_norm": 0.009992929175496101,
"learning_rate": 9.415862738267347e-05,
"loss": 0.0006,
"step": 738
},
{
"epoch": 5.983805668016195,
"grad_norm": 0.0034835604019463062,
"learning_rate": 9.413197155904201e-05,
"loss": 0.0004,
"step": 739
},
{
"epoch": 5.991902834008097,
"grad_norm": 0.006391593255102634,
"learning_rate": 9.410525884557084e-05,
"loss": 0.0006,
"step": 740
},
{
"epoch": 6.0,
"grad_norm": 0.012005936354398727,
"learning_rate": 9.407848927669494e-05,
"loss": 0.0007,
"step": 741
},
{
"epoch": 6.008097165991903,
"grad_norm": 0.005054526962339878,
"learning_rate": 9.405166288692249e-05,
"loss": 0.0003,
"step": 742
},
{
"epoch": 6.016194331983805,
"grad_norm": 0.0041589937172830105,
"learning_rate": 9.402477971083501e-05,
"loss": 0.0004,
"step": 743
},
{
"epoch": 6.0242914979757085,
"grad_norm": 0.004005232825875282,
"learning_rate": 9.399783978308716e-05,
"loss": 0.0003,
"step": 744
},
{
"epoch": 6.032388663967612,
"grad_norm": 0.0026451381854712963,
"learning_rate": 9.39708431384068e-05,
"loss": 0.0003,
"step": 745
},
{
"epoch": 6.040485829959514,
"grad_norm": 0.0035752663388848305,
"learning_rate": 9.39437898115949e-05,
"loss": 0.0003,
"step": 746
},
{
"epoch": 6.048582995951417,
"grad_norm": 0.00357494642958045,
"learning_rate": 9.391667983752545e-05,
"loss": 0.0004,
"step": 747
},
{
"epoch": 6.05668016194332,
"grad_norm": 0.006460043601691723,
"learning_rate": 9.388951325114552e-05,
"loss": 0.0006,
"step": 748
},
{
"epoch": 6.064777327935222,
"grad_norm": 0.004135911352932453,
"learning_rate": 9.386229008747514e-05,
"loss": 0.0003,
"step": 749
},
{
"epoch": 6.0728744939271255,
"grad_norm": 0.0059121702797710896,
"learning_rate": 9.383501038160725e-05,
"loss": 0.0006,
"step": 750
},
{
"epoch": 6.0728744939271255,
"eval_loss": 0.0015712064923718572,
"eval_runtime": 20.9013,
"eval_samples_per_second": 4.784,
"eval_steps_per_second": 1.196,
"step": 750
},
{
"epoch": 6.080971659919029,
"grad_norm": 0.0053249807097017765,
"learning_rate": 9.380767416870768e-05,
"loss": 0.0004,
"step": 751
},
{
"epoch": 6.089068825910931,
"grad_norm": 0.0062401313334703445,
"learning_rate": 9.378028148401516e-05,
"loss": 0.0005,
"step": 752
},
{
"epoch": 6.097165991902834,
"grad_norm": 0.003968521486967802,
"learning_rate": 9.375283236284116e-05,
"loss": 0.0004,
"step": 753
},
{
"epoch": 6.105263157894737,
"grad_norm": 0.0026025085244327784,
"learning_rate": 9.37253268405699e-05,
"loss": 0.0003,
"step": 754
},
{
"epoch": 6.113360323886639,
"grad_norm": 0.004716400057077408,
"learning_rate": 9.369776495265831e-05,
"loss": 0.0003,
"step": 755
},
{
"epoch": 6.1214574898785425,
"grad_norm": 0.006493438966572285,
"learning_rate": 9.367014673463605e-05,
"loss": 0.0003,
"step": 756
},
{
"epoch": 6.129554655870446,
"grad_norm": 0.004002865869551897,
"learning_rate": 9.364247222210529e-05,
"loss": 0.0004,
"step": 757
},
{
"epoch": 6.137651821862348,
"grad_norm": 0.006383996922522783,
"learning_rate": 9.361474145074081e-05,
"loss": 0.0004,
"step": 758
},
{
"epoch": 6.145748987854251,
"grad_norm": 0.008037807419896126,
"learning_rate": 9.358695445628996e-05,
"loss": 0.0004,
"step": 759
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.002863495144993067,
"learning_rate": 9.355911127457247e-05,
"loss": 0.0003,
"step": 760
},
{
"epoch": 6.161943319838056,
"grad_norm": 0.0033732058946043253,
"learning_rate": 9.353121194148058e-05,
"loss": 0.0003,
"step": 761
},
{
"epoch": 6.17004048582996,
"grad_norm": 0.00469600223004818,
"learning_rate": 9.35032564929789e-05,
"loss": 0.0003,
"step": 762
},
{
"epoch": 6.178137651821863,
"grad_norm": 0.0032547153532505035,
"learning_rate": 9.347524496510436e-05,
"loss": 0.0003,
"step": 763
},
{
"epoch": 6.186234817813765,
"grad_norm": 0.003363592317327857,
"learning_rate": 9.344717739396616e-05,
"loss": 0.0003,
"step": 764
},
{
"epoch": 6.194331983805668,
"grad_norm": 0.006892678793519735,
"learning_rate": 9.341905381574579e-05,
"loss": 0.0004,
"step": 765
},
{
"epoch": 6.202429149797571,
"grad_norm": 0.003087045392021537,
"learning_rate": 9.339087426669692e-05,
"loss": 0.0003,
"step": 766
},
{
"epoch": 6.2105263157894735,
"grad_norm": 0.004086201544851065,
"learning_rate": 9.336263878314536e-05,
"loss": 0.0003,
"step": 767
},
{
"epoch": 6.218623481781377,
"grad_norm": 0.007387399207800627,
"learning_rate": 9.333434740148904e-05,
"loss": 0.0004,
"step": 768
},
{
"epoch": 6.22672064777328,
"grad_norm": 0.008816695772111416,
"learning_rate": 9.330600015819795e-05,
"loss": 0.0003,
"step": 769
},
{
"epoch": 6.234817813765182,
"grad_norm": 0.005616582930088043,
"learning_rate": 9.327759708981406e-05,
"loss": 0.0005,
"step": 770
},
{
"epoch": 6.242914979757085,
"grad_norm": 0.0017385140527039766,
"learning_rate": 9.324913823295133e-05,
"loss": 0.0002,
"step": 771
},
{
"epoch": 6.251012145748988,
"grad_norm": 0.005367297679185867,
"learning_rate": 9.322062362429564e-05,
"loss": 0.0003,
"step": 772
},
{
"epoch": 6.2591093117408905,
"grad_norm": 0.00460553914308548,
"learning_rate": 9.319205330060475e-05,
"loss": 0.0003,
"step": 773
},
{
"epoch": 6.267206477732794,
"grad_norm": 0.007978829555213451,
"learning_rate": 9.316342729870818e-05,
"loss": 0.0007,
"step": 774
},
{
"epoch": 6.275303643724697,
"grad_norm": 0.0032822596840560436,
"learning_rate": 9.313474565550729e-05,
"loss": 0.0003,
"step": 775
},
{
"epoch": 6.275303643724697,
"eval_loss": 0.001532125286757946,
"eval_runtime": 20.8909,
"eval_samples_per_second": 4.787,
"eval_steps_per_second": 1.197,
"step": 775
},
{
"epoch": 6.283400809716599,
"grad_norm": 0.00468923756852746,
"learning_rate": 9.310600840797512e-05,
"loss": 0.0003,
"step": 776
},
{
"epoch": 6.291497975708502,
"grad_norm": 0.003942742943763733,
"learning_rate": 9.307721559315644e-05,
"loss": 0.0003,
"step": 777
},
{
"epoch": 6.299595141700405,
"grad_norm": 0.0031825690530240536,
"learning_rate": 9.304836724816758e-05,
"loss": 0.0003,
"step": 778
},
{
"epoch": 6.3076923076923075,
"grad_norm": 0.014810767956078053,
"learning_rate": 9.301946341019653e-05,
"loss": 0.0005,
"step": 779
},
{
"epoch": 6.315789473684211,
"grad_norm": 0.0029204150196164846,
"learning_rate": 9.299050411650276e-05,
"loss": 0.0003,
"step": 780
},
{
"epoch": 6.323886639676114,
"grad_norm": 0.005694466643035412,
"learning_rate": 9.296148940441727e-05,
"loss": 0.0003,
"step": 781
},
{
"epoch": 6.331983805668016,
"grad_norm": 0.004693406168371439,
"learning_rate": 9.293241931134244e-05,
"loss": 0.0003,
"step": 782
},
{
"epoch": 6.340080971659919,
"grad_norm": 0.005331242457032204,
"learning_rate": 9.290329387475212e-05,
"loss": 0.0003,
"step": 783
},
{
"epoch": 6.348178137651822,
"grad_norm": 0.005269868765026331,
"learning_rate": 9.28741131321914e-05,
"loss": 0.0003,
"step": 784
},
{
"epoch": 6.3562753036437245,
"grad_norm": 0.008096984587609768,
"learning_rate": 9.284487712127677e-05,
"loss": 0.0003,
"step": 785
},
{
"epoch": 6.364372469635628,
"grad_norm": 0.005979133769869804,
"learning_rate": 9.281558587969591e-05,
"loss": 0.0003,
"step": 786
},
{
"epoch": 6.372469635627531,
"grad_norm": 0.005570698995143175,
"learning_rate": 9.27862394452077e-05,
"loss": 0.0003,
"step": 787
},
{
"epoch": 6.380566801619433,
"grad_norm": 0.0197993665933609,
"learning_rate": 9.275683785564216e-05,
"loss": 0.0004,
"step": 788
},
{
"epoch": 6.388663967611336,
"grad_norm": 0.0068647717125713825,
"learning_rate": 9.272738114890043e-05,
"loss": 0.0004,
"step": 789
},
{
"epoch": 6.396761133603239,
"grad_norm": 0.005690258927643299,
"learning_rate": 9.269786936295471e-05,
"loss": 0.0003,
"step": 790
},
{
"epoch": 6.4048582995951415,
"grad_norm": 0.003921832423657179,
"learning_rate": 9.266830253584815e-05,
"loss": 0.0004,
"step": 791
},
{
"epoch": 6.412955465587045,
"grad_norm": 0.007321410812437534,
"learning_rate": 9.263868070569494e-05,
"loss": 0.0005,
"step": 792
},
{
"epoch": 6.421052631578947,
"grad_norm": 0.006045639980584383,
"learning_rate": 9.260900391068008e-05,
"loss": 0.0004,
"step": 793
},
{
"epoch": 6.42914979757085,
"grad_norm": 0.006704007275402546,
"learning_rate": 9.257927218905947e-05,
"loss": 0.0005,
"step": 794
},
{
"epoch": 6.437246963562753,
"grad_norm": 0.005516419652849436,
"learning_rate": 9.254948557915983e-05,
"loss": 0.0005,
"step": 795
},
{
"epoch": 6.445344129554655,
"grad_norm": 0.007129244972020388,
"learning_rate": 9.25196441193786e-05,
"loss": 0.0004,
"step": 796
},
{
"epoch": 6.4534412955465585,
"grad_norm": 0.006566017400473356,
"learning_rate": 9.248974784818396e-05,
"loss": 0.0006,
"step": 797
},
{
"epoch": 6.461538461538462,
"grad_norm": 0.004069427493959665,
"learning_rate": 9.245979680411469e-05,
"loss": 0.0003,
"step": 798
},
{
"epoch": 6.469635627530364,
"grad_norm": 0.00994227733463049,
"learning_rate": 9.242979102578027e-05,
"loss": 0.0005,
"step": 799
},
{
"epoch": 6.477732793522267,
"grad_norm": 0.009190657176077366,
"learning_rate": 9.239973055186066e-05,
"loss": 0.0003,
"step": 800
},
{
"epoch": 6.477732793522267,
"eval_loss": 0.001472490606829524,
"eval_runtime": 20.8842,
"eval_samples_per_second": 4.788,
"eval_steps_per_second": 1.197,
"step": 800
},
{
"epoch": 6.48582995951417,
"grad_norm": 0.004918240942060947,
"learning_rate": 9.236961542110634e-05,
"loss": 0.0003,
"step": 801
},
{
"epoch": 6.493927125506072,
"grad_norm": 0.0071716029196977615,
"learning_rate": 9.233944567233825e-05,
"loss": 0.0004,
"step": 802
},
{
"epoch": 6.502024291497976,
"grad_norm": 0.006958463694900274,
"learning_rate": 9.230922134444779e-05,
"loss": 0.0006,
"step": 803
},
{
"epoch": 6.510121457489879,
"grad_norm": 0.004981396719813347,
"learning_rate": 9.227894247639661e-05,
"loss": 0.0005,
"step": 804
},
{
"epoch": 6.518218623481781,
"grad_norm": 0.0036478445399552584,
"learning_rate": 9.224860910721679e-05,
"loss": 0.0003,
"step": 805
},
{
"epoch": 6.526315789473684,
"grad_norm": 0.0037034437991678715,
"learning_rate": 9.221822127601057e-05,
"loss": 0.0003,
"step": 806
},
{
"epoch": 6.534412955465587,
"grad_norm": 0.008089096285402775,
"learning_rate": 9.218777902195043e-05,
"loss": 0.0006,
"step": 807
},
{
"epoch": 6.5425101214574894,
"grad_norm": 0.0034797810949385166,
"learning_rate": 9.215728238427901e-05,
"loss": 0.0004,
"step": 808
},
{
"epoch": 6.550607287449393,
"grad_norm": 0.011531390249729156,
"learning_rate": 9.212673140230907e-05,
"loss": 0.0006,
"step": 809
},
{
"epoch": 6.558704453441296,
"grad_norm": 0.0037022933829575777,
"learning_rate": 9.20961261154234e-05,
"loss": 0.0003,
"step": 810
},
{
"epoch": 6.566801619433198,
"grad_norm": 0.0049828048795461655,
"learning_rate": 9.206546656307478e-05,
"loss": 0.0003,
"step": 811
},
{
"epoch": 6.574898785425101,
"grad_norm": 0.004933022893965244,
"learning_rate": 9.2034752784786e-05,
"loss": 0.0004,
"step": 812
},
{
"epoch": 6.582995951417004,
"grad_norm": 0.0033220879267901182,
"learning_rate": 9.200398482014967e-05,
"loss": 0.0003,
"step": 813
},
{
"epoch": 6.5910931174089065,
"grad_norm": 0.0034737708047032356,
"learning_rate": 9.197316270882833e-05,
"loss": 0.0003,
"step": 814
},
{
"epoch": 6.59919028340081,
"grad_norm": 0.004562221933156252,
"learning_rate": 9.194228649055427e-05,
"loss": 0.0005,
"step": 815
},
{
"epoch": 6.607287449392713,
"grad_norm": 0.004385192412883043,
"learning_rate": 9.191135620512956e-05,
"loss": 0.0005,
"step": 816
},
{
"epoch": 6.615384615384615,
"grad_norm": 0.002965794876217842,
"learning_rate": 9.188037189242593e-05,
"loss": 0.0003,
"step": 817
},
{
"epoch": 6.623481781376518,
"grad_norm": 0.0037224399857223034,
"learning_rate": 9.184933359238479e-05,
"loss": 0.0004,
"step": 818
},
{
"epoch": 6.631578947368421,
"grad_norm": 0.00610083993524313,
"learning_rate": 9.181824134501711e-05,
"loss": 0.0003,
"step": 819
},
{
"epoch": 6.6396761133603235,
"grad_norm": 0.004559030756354332,
"learning_rate": 9.178709519040347e-05,
"loss": 0.0003,
"step": 820
},
{
"epoch": 6.647773279352227,
"grad_norm": 0.004950647708028555,
"learning_rate": 9.175589516869386e-05,
"loss": 0.0003,
"step": 821
},
{
"epoch": 6.65587044534413,
"grad_norm": 0.0062233456410467625,
"learning_rate": 9.172464132010773e-05,
"loss": 0.0006,
"step": 822
},
{
"epoch": 6.663967611336032,
"grad_norm": 0.005544841755181551,
"learning_rate": 9.169333368493396e-05,
"loss": 0.0004,
"step": 823
},
{
"epoch": 6.672064777327935,
"grad_norm": 0.005783478729426861,
"learning_rate": 9.166197230353073e-05,
"loss": 0.0003,
"step": 824
},
{
"epoch": 6.680161943319838,
"grad_norm": 0.004205208271741867,
"learning_rate": 9.163055721632549e-05,
"loss": 0.0003,
"step": 825
},
{
"epoch": 6.680161943319838,
"eval_loss": 0.0014739898033440113,
"eval_runtime": 20.8491,
"eval_samples_per_second": 4.796,
"eval_steps_per_second": 1.199,
"step": 825
},
{
"epoch": 6.6882591093117405,
"grad_norm": 0.002174858469516039,
"learning_rate": 9.159908846381498e-05,
"loss": 0.0002,
"step": 826
},
{
"epoch": 6.696356275303644,
"grad_norm": 0.007229868322610855,
"learning_rate": 9.156756608656506e-05,
"loss": 0.0004,
"step": 827
},
{
"epoch": 6.704453441295547,
"grad_norm": 0.00587072828784585,
"learning_rate": 9.153599012521073e-05,
"loss": 0.0003,
"step": 828
},
{
"epoch": 6.712550607287449,
"grad_norm": 0.00896474625915289,
"learning_rate": 9.150436062045607e-05,
"loss": 0.0003,
"step": 829
},
{
"epoch": 6.720647773279352,
"grad_norm": 0.005071448162198067,
"learning_rate": 9.147267761307421e-05,
"loss": 0.0003,
"step": 830
},
{
"epoch": 6.728744939271255,
"grad_norm": 0.005802792031317949,
"learning_rate": 9.144094114390718e-05,
"loss": 0.0003,
"step": 831
},
{
"epoch": 6.7368421052631575,
"grad_norm": 0.0023914834018796682,
"learning_rate": 9.140915125386602e-05,
"loss": 0.0003,
"step": 832
},
{
"epoch": 6.744939271255061,
"grad_norm": 0.004418396390974522,
"learning_rate": 9.137730798393054e-05,
"loss": 0.0003,
"step": 833
},
{
"epoch": 6.753036437246964,
"grad_norm": 0.006196359172463417,
"learning_rate": 9.134541137514945e-05,
"loss": 0.0003,
"step": 834
},
{
"epoch": 6.761133603238866,
"grad_norm": 0.0021507740020751953,
"learning_rate": 9.131346146864013e-05,
"loss": 0.0002,
"step": 835
},
{
"epoch": 6.769230769230769,
"grad_norm": 0.003959763795137405,
"learning_rate": 9.128145830558872e-05,
"loss": 0.0003,
"step": 836
},
{
"epoch": 6.777327935222672,
"grad_norm": 0.0037787449546158314,
"learning_rate": 9.124940192725002e-05,
"loss": 0.0003,
"step": 837
},
{
"epoch": 6.7854251012145745,
"grad_norm": 0.008174480870366096,
"learning_rate": 9.121729237494738e-05,
"loss": 0.0006,
"step": 838
},
{
"epoch": 6.793522267206478,
"grad_norm": 0.004929924383759499,
"learning_rate": 9.118512969007276e-05,
"loss": 0.0004,
"step": 839
},
{
"epoch": 6.801619433198381,
"grad_norm": 0.004881354980170727,
"learning_rate": 9.115291391408656e-05,
"loss": 0.0004,
"step": 840
},
{
"epoch": 6.809716599190283,
"grad_norm": 0.004849132616072893,
"learning_rate": 9.112064508851763e-05,
"loss": 0.0003,
"step": 841
},
{
"epoch": 6.817813765182186,
"grad_norm": 0.007561651989817619,
"learning_rate": 9.108832325496322e-05,
"loss": 0.0004,
"step": 842
},
{
"epoch": 6.825910931174089,
"grad_norm": 0.005616433918476105,
"learning_rate": 9.105594845508891e-05,
"loss": 0.0003,
"step": 843
},
{
"epoch": 6.834008097165992,
"grad_norm": 0.004294142127037048,
"learning_rate": 9.102352073062854e-05,
"loss": 0.0004,
"step": 844
},
{
"epoch": 6.842105263157895,
"grad_norm": 0.0037746201269328594,
"learning_rate": 9.09910401233842e-05,
"loss": 0.0003,
"step": 845
},
{
"epoch": 6.850202429149798,
"grad_norm": 0.008090085349977016,
"learning_rate": 9.095850667522611e-05,
"loss": 0.001,
"step": 846
},
{
"epoch": 6.8582995951417,
"grad_norm": 0.006233837455511093,
"learning_rate": 9.092592042809267e-05,
"loss": 0.0004,
"step": 847
},
{
"epoch": 6.866396761133603,
"grad_norm": 0.01793971285223961,
"learning_rate": 9.08932814239903e-05,
"loss": 0.0004,
"step": 848
},
{
"epoch": 6.874493927125506,
"grad_norm": 0.008522222749888897,
"learning_rate": 9.086058970499341e-05,
"loss": 0.0003,
"step": 849
},
{
"epoch": 6.882591093117409,
"grad_norm": 0.004106331150978804,
"learning_rate": 9.082784531324437e-05,
"loss": 0.0004,
"step": 850
},
{
"epoch": 6.882591093117409,
"eval_loss": 0.0014429772272706032,
"eval_runtime": 20.9236,
"eval_samples_per_second": 4.779,
"eval_steps_per_second": 1.195,
"step": 850
},
{
"epoch": 6.890688259109312,
"grad_norm": 0.005821602884680033,
"learning_rate": 9.079504829095354e-05,
"loss": 0.0004,
"step": 851
},
{
"epoch": 6.898785425101215,
"grad_norm": 0.0054057384841144085,
"learning_rate": 9.076219868039899e-05,
"loss": 0.0004,
"step": 852
},
{
"epoch": 6.906882591093117,
"grad_norm": 0.0044207195751369,
"learning_rate": 9.072929652392666e-05,
"loss": 0.0004,
"step": 853
},
{
"epoch": 6.91497975708502,
"grad_norm": 0.005526500288397074,
"learning_rate": 9.069634186395022e-05,
"loss": 0.0004,
"step": 854
},
{
"epoch": 6.923076923076923,
"grad_norm": 0.003987747244536877,
"learning_rate": 9.066333474295099e-05,
"loss": 0.0003,
"step": 855
},
{
"epoch": 6.931174089068826,
"grad_norm": 0.006826246622949839,
"learning_rate": 9.063027520347796e-05,
"loss": 0.0004,
"step": 856
},
{
"epoch": 6.939271255060729,
"grad_norm": 0.008676338940858841,
"learning_rate": 9.059716328814765e-05,
"loss": 0.0008,
"step": 857
},
{
"epoch": 6.947368421052632,
"grad_norm": 0.006324645131826401,
"learning_rate": 9.056399903964414e-05,
"loss": 0.0003,
"step": 858
},
{
"epoch": 6.955465587044534,
"grad_norm": 0.008029861375689507,
"learning_rate": 9.053078250071891e-05,
"loss": 0.0005,
"step": 859
},
{
"epoch": 6.963562753036437,
"grad_norm": 0.008121266961097717,
"learning_rate": 9.049751371419093e-05,
"loss": 0.0006,
"step": 860
},
{
"epoch": 6.97165991902834,
"grad_norm": 0.0037181430961936712,
"learning_rate": 9.046419272294644e-05,
"loss": 0.0004,
"step": 861
},
{
"epoch": 6.979757085020243,
"grad_norm": 0.003322604577988386,
"learning_rate": 9.043081956993904e-05,
"loss": 0.0003,
"step": 862
},
{
"epoch": 6.987854251012146,
"grad_norm": 0.003378738649189472,
"learning_rate": 9.039739429818953e-05,
"loss": 0.0003,
"step": 863
},
{
"epoch": 6.995951417004049,
"grad_norm": 0.0044447388499975204,
"learning_rate": 9.036391695078589e-05,
"loss": 0.0004,
"step": 864
},
{
"epoch": 7.004048582995951,
"grad_norm": 0.009173744358122349,
"learning_rate": 9.03303875708833e-05,
"loss": 0.0006,
"step": 865
},
{
"epoch": 7.012145748987854,
"grad_norm": 0.005636727903038263,
"learning_rate": 9.029680620170392e-05,
"loss": 0.0003,
"step": 866
},
{
"epoch": 7.020242914979757,
"grad_norm": 0.007775536272674799,
"learning_rate": 9.026317288653698e-05,
"loss": 0.0003,
"step": 867
},
{
"epoch": 7.02834008097166,
"grad_norm": 0.002120030578225851,
"learning_rate": 9.022948766873868e-05,
"loss": 0.0002,
"step": 868
},
{
"epoch": 7.036437246963563,
"grad_norm": 0.0061560156755149364,
"learning_rate": 9.019575059173209e-05,
"loss": 0.0003,
"step": 869
},
{
"epoch": 7.044534412955466,
"grad_norm": 0.003942039795219898,
"learning_rate": 9.016196169900717e-05,
"loss": 0.0003,
"step": 870
},
{
"epoch": 7.052631578947368,
"grad_norm": 0.005075725261121988,
"learning_rate": 9.012812103412065e-05,
"loss": 0.0003,
"step": 871
},
{
"epoch": 7.060728744939271,
"grad_norm": 0.008946137502789497,
"learning_rate": 9.0094228640696e-05,
"loss": 0.0008,
"step": 872
},
{
"epoch": 7.068825910931174,
"grad_norm": 0.005815677810460329,
"learning_rate": 9.006028456242339e-05,
"loss": 0.0004,
"step": 873
},
{
"epoch": 7.076923076923077,
"grad_norm": 0.0050154379568994045,
"learning_rate": 9.002628884305959e-05,
"loss": 0.0004,
"step": 874
},
{
"epoch": 7.08502024291498,
"grad_norm": 0.00504196947440505,
"learning_rate": 8.999224152642798e-05,
"loss": 0.0003,
"step": 875
},
{
"epoch": 7.08502024291498,
"eval_loss": 0.0014816973125562072,
"eval_runtime": 20.8403,
"eval_samples_per_second": 4.798,
"eval_steps_per_second": 1.2,
"step": 875
},
{
"epoch": 7.093117408906883,
"grad_norm": 0.0035983005072921515,
"learning_rate": 8.995814265641841e-05,
"loss": 0.0004,
"step": 876
},
{
"epoch": 7.101214574898785,
"grad_norm": 0.004920099396258593,
"learning_rate": 8.992399227698721e-05,
"loss": 0.0004,
"step": 877
},
{
"epoch": 7.109311740890688,
"grad_norm": 0.007466362789273262,
"learning_rate": 8.988979043215708e-05,
"loss": 0.0004,
"step": 878
},
{
"epoch": 7.117408906882591,
"grad_norm": 0.0035266538616269827,
"learning_rate": 8.985553716601711e-05,
"loss": 0.0003,
"step": 879
},
{
"epoch": 7.125506072874494,
"grad_norm": 0.0027432111091911793,
"learning_rate": 8.982123252272265e-05,
"loss": 0.0003,
"step": 880
},
{
"epoch": 7.133603238866397,
"grad_norm": 0.005367937497794628,
"learning_rate": 8.97868765464953e-05,
"loss": 0.0004,
"step": 881
},
{
"epoch": 7.1417004048583,
"grad_norm": 0.00508796377107501,
"learning_rate": 8.97524692816228e-05,
"loss": 0.0004,
"step": 882
},
{
"epoch": 7.149797570850202,
"grad_norm": 0.005310658365488052,
"learning_rate": 8.9718010772459e-05,
"loss": 0.0004,
"step": 883
},
{
"epoch": 7.157894736842105,
"grad_norm": 0.003940463997423649,
"learning_rate": 8.968350106342387e-05,
"loss": 0.0003,
"step": 884
},
{
"epoch": 7.165991902834008,
"grad_norm": 0.0020052019972354174,
"learning_rate": 8.964894019900332e-05,
"loss": 0.0002,
"step": 885
},
{
"epoch": 7.174089068825911,
"grad_norm": 0.0013627801090478897,
"learning_rate": 8.961432822374922e-05,
"loss": 0.0002,
"step": 886
},
{
"epoch": 7.182186234817814,
"grad_norm": 0.004774102475494146,
"learning_rate": 8.957966518227934e-05,
"loss": 0.0005,
"step": 887
},
{
"epoch": 7.190283400809717,
"grad_norm": 0.004429248161613941,
"learning_rate": 8.954495111927726e-05,
"loss": 0.0004,
"step": 888
},
{
"epoch": 7.198380566801619,
"grad_norm": 0.0024098637513816357,
"learning_rate": 8.951018607949232e-05,
"loss": 0.0002,
"step": 889
},
{
"epoch": 7.206477732793522,
"grad_norm": 0.004957470111548901,
"learning_rate": 8.947537010773966e-05,
"loss": 0.0004,
"step": 890
},
{
"epoch": 7.2145748987854255,
"grad_norm": 0.0025283123832195997,
"learning_rate": 8.944050324889995e-05,
"loss": 0.0003,
"step": 891
},
{
"epoch": 7.222672064777328,
"grad_norm": 0.00853675790131092,
"learning_rate": 8.940558554791952e-05,
"loss": 0.0003,
"step": 892
},
{
"epoch": 7.230769230769231,
"grad_norm": 0.004788931459188461,
"learning_rate": 8.937061704981026e-05,
"loss": 0.0003,
"step": 893
},
{
"epoch": 7.238866396761134,
"grad_norm": 0.007903149351477623,
"learning_rate": 8.933559779964951e-05,
"loss": 0.0003,
"step": 894
},
{
"epoch": 7.246963562753036,
"grad_norm": 0.01020093634724617,
"learning_rate": 8.930052784258004e-05,
"loss": 0.0003,
"step": 895
},
{
"epoch": 7.255060728744939,
"grad_norm": 0.0071566407568752766,
"learning_rate": 8.926540722380999e-05,
"loss": 0.0003,
"step": 896
},
{
"epoch": 7.2631578947368425,
"grad_norm": 0.011600234545767307,
"learning_rate": 8.92302359886128e-05,
"loss": 0.0006,
"step": 897
},
{
"epoch": 7.271255060728745,
"grad_norm": 0.0032473020255565643,
"learning_rate": 8.919501418232716e-05,
"loss": 0.0003,
"step": 898
},
{
"epoch": 7.279352226720648,
"grad_norm": 0.003534214338287711,
"learning_rate": 8.915974185035696e-05,
"loss": 0.0003,
"step": 899
},
{
"epoch": 7.287449392712551,
"grad_norm": 0.00417256960645318,
"learning_rate": 8.912441903817122e-05,
"loss": 0.0004,
"step": 900
},
{
"epoch": 7.287449392712551,
"eval_loss": 0.001444089226424694,
"eval_runtime": 20.861,
"eval_samples_per_second": 4.794,
"eval_steps_per_second": 1.198,
"step": 900
},
{
"epoch": 7.295546558704453,
"grad_norm": 0.004150230437517166,
"learning_rate": 8.908904579130403e-05,
"loss": 0.0005,
"step": 901
},
{
"epoch": 7.303643724696356,
"grad_norm": 0.0044599175453186035,
"learning_rate": 8.905362215535447e-05,
"loss": 0.0003,
"step": 902
},
{
"epoch": 7.3117408906882595,
"grad_norm": 0.004847770091146231,
"learning_rate": 8.901814817598664e-05,
"loss": 0.0003,
"step": 903
},
{
"epoch": 7.319838056680162,
"grad_norm": 0.006142038386315107,
"learning_rate": 8.898262389892946e-05,
"loss": 0.0003,
"step": 904
},
{
"epoch": 7.327935222672065,
"grad_norm": 0.004584239795804024,
"learning_rate": 8.894704936997674e-05,
"loss": 0.0003,
"step": 905
},
{
"epoch": 7.336032388663968,
"grad_norm": 0.00513102114200592,
"learning_rate": 8.891142463498705e-05,
"loss": 0.0005,
"step": 906
},
{
"epoch": 7.34412955465587,
"grad_norm": 0.003908930346369743,
"learning_rate": 8.887574973988368e-05,
"loss": 0.0003,
"step": 907
},
{
"epoch": 7.352226720647773,
"grad_norm": 0.003959581255912781,
"learning_rate": 8.884002473065459e-05,
"loss": 0.0002,
"step": 908
},
{
"epoch": 7.3603238866396765,
"grad_norm": 0.005572907626628876,
"learning_rate": 8.880424965335234e-05,
"loss": 0.0003,
"step": 909
},
{
"epoch": 7.368421052631579,
"grad_norm": 0.005206345114856958,
"learning_rate": 8.8768424554094e-05,
"loss": 0.0005,
"step": 910
},
{
"epoch": 7.376518218623482,
"grad_norm": 0.003059947630390525,
"learning_rate": 8.87325494790612e-05,
"loss": 0.0002,
"step": 911
},
{
"epoch": 7.384615384615385,
"grad_norm": 0.0039919642731547356,
"learning_rate": 8.86966244744999e-05,
"loss": 0.0003,
"step": 912
},
{
"epoch": 7.392712550607287,
"grad_norm": 0.008040755987167358,
"learning_rate": 8.866064958672047e-05,
"loss": 0.0004,
"step": 913
},
{
"epoch": 7.40080971659919,
"grad_norm": 0.003773482283577323,
"learning_rate": 8.862462486209758e-05,
"loss": 0.0003,
"step": 914
},
{
"epoch": 7.4089068825910935,
"grad_norm": 0.003051398554816842,
"learning_rate": 8.858855034707016e-05,
"loss": 0.0002,
"step": 915
},
{
"epoch": 7.417004048582996,
"grad_norm": 0.012867518700659275,
"learning_rate": 8.855242608814132e-05,
"loss": 0.0003,
"step": 916
},
{
"epoch": 7.425101214574899,
"grad_norm": 0.008691791445016861,
"learning_rate": 8.851625213187823e-05,
"loss": 0.0003,
"step": 917
},
{
"epoch": 7.433198380566802,
"grad_norm": 0.0036156801506876945,
"learning_rate": 8.848002852491222e-05,
"loss": 0.0003,
"step": 918
},
{
"epoch": 7.441295546558704,
"grad_norm": 0.00877736322581768,
"learning_rate": 8.844375531393856e-05,
"loss": 0.0005,
"step": 919
},
{
"epoch": 7.449392712550607,
"grad_norm": 0.006146470084786415,
"learning_rate": 8.840743254571648e-05,
"loss": 0.0003,
"step": 920
},
{
"epoch": 7.4574898785425106,
"grad_norm": 0.005941275041550398,
"learning_rate": 8.837106026706911e-05,
"loss": 0.0003,
"step": 921
},
{
"epoch": 7.465587044534413,
"grad_norm": 0.006193244829773903,
"learning_rate": 8.83346385248834e-05,
"loss": 0.0003,
"step": 922
},
{
"epoch": 7.473684210526316,
"grad_norm": 0.005591843742877245,
"learning_rate": 8.829816736611003e-05,
"loss": 0.0004,
"step": 923
},
{
"epoch": 7.481781376518219,
"grad_norm": 0.004145904444158077,
"learning_rate": 8.82616468377634e-05,
"loss": 0.0002,
"step": 924
},
{
"epoch": 7.489878542510121,
"grad_norm": 0.00641997903585434,
"learning_rate": 8.82250769869216e-05,
"loss": 0.0004,
"step": 925
},
{
"epoch": 7.489878542510121,
"eval_loss": 0.0014737433521077037,
"eval_runtime": 20.8683,
"eval_samples_per_second": 4.792,
"eval_steps_per_second": 1.198,
"step": 925
},
{
"epoch": 7.497975708502024,
"grad_norm": 0.0023201555013656616,
"learning_rate": 8.81884578607262e-05,
"loss": 0.0003,
"step": 926
},
{
"epoch": 7.506072874493928,
"grad_norm": 0.004539423622190952,
"learning_rate": 8.815178950638239e-05,
"loss": 0.0003,
"step": 927
},
{
"epoch": 7.51417004048583,
"grad_norm": 0.005551203154027462,
"learning_rate": 8.811507197115876e-05,
"loss": 0.0003,
"step": 928
},
{
"epoch": 7.522267206477733,
"grad_norm": 0.0017370175337418914,
"learning_rate": 8.80783053023873e-05,
"loss": 0.0002,
"step": 929
},
{
"epoch": 7.530364372469636,
"grad_norm": 0.0032071254681795835,
"learning_rate": 8.804148954746338e-05,
"loss": 0.0003,
"step": 930
},
{
"epoch": 7.538461538461538,
"grad_norm": 0.008781791664659977,
"learning_rate": 8.80046247538456e-05,
"loss": 0.0004,
"step": 931
},
{
"epoch": 7.5465587044534415,
"grad_norm": 0.007886053062975407,
"learning_rate": 8.796771096905581e-05,
"loss": 0.0004,
"step": 932
},
{
"epoch": 7.554655870445345,
"grad_norm": 0.00439440319314599,
"learning_rate": 8.793074824067898e-05,
"loss": 0.0004,
"step": 933
},
{
"epoch": 7.562753036437247,
"grad_norm": 0.006747337989509106,
"learning_rate": 8.789373661636318e-05,
"loss": 0.0004,
"step": 934
},
{
"epoch": 7.57085020242915,
"grad_norm": 0.004344824235886335,
"learning_rate": 8.785667614381956e-05,
"loss": 0.0003,
"step": 935
},
{
"epoch": 7.578947368421053,
"grad_norm": 0.014749690890312195,
"learning_rate": 8.781956687082215e-05,
"loss": 0.0004,
"step": 936
},
{
"epoch": 7.587044534412955,
"grad_norm": 0.006340457126498222,
"learning_rate": 8.778240884520798e-05,
"loss": 0.0006,
"step": 937
},
{
"epoch": 7.5951417004048585,
"grad_norm": 0.0055120596662163734,
"learning_rate": 8.774520211487689e-05,
"loss": 0.0003,
"step": 938
},
{
"epoch": 7.603238866396762,
"grad_norm": 0.0028302748687565327,
"learning_rate": 8.770794672779145e-05,
"loss": 0.0003,
"step": 939
},
{
"epoch": 7.611336032388664,
"grad_norm": 0.006510081235319376,
"learning_rate": 8.767064273197705e-05,
"loss": 0.0002,
"step": 940
},
{
"epoch": 7.619433198380567,
"grad_norm": 0.0060364557430148125,
"learning_rate": 8.763329017552165e-05,
"loss": 0.0004,
"step": 941
},
{
"epoch": 7.62753036437247,
"grad_norm": 0.0069529772736132145,
"learning_rate": 8.759588910657588e-05,
"loss": 0.0005,
"step": 942
},
{
"epoch": 7.635627530364372,
"grad_norm": 0.004733328241854906,
"learning_rate": 8.755843957335287e-05,
"loss": 0.0003,
"step": 943
},
{
"epoch": 7.6437246963562755,
"grad_norm": 0.0061776163056492805,
"learning_rate": 8.752094162412823e-05,
"loss": 0.0004,
"step": 944
},
{
"epoch": 7.651821862348179,
"grad_norm": 0.002511364873498678,
"learning_rate": 8.748339530723999e-05,
"loss": 0.0003,
"step": 945
},
{
"epoch": 7.659919028340081,
"grad_norm": 0.005621533375233412,
"learning_rate": 8.744580067108851e-05,
"loss": 0.0005,
"step": 946
},
{
"epoch": 7.668016194331984,
"grad_norm": 0.004172396846115589,
"learning_rate": 8.740815776413649e-05,
"loss": 0.0002,
"step": 947
},
{
"epoch": 7.676113360323887,
"grad_norm": 0.005457951687276363,
"learning_rate": 8.737046663490877e-05,
"loss": 0.0003,
"step": 948
},
{
"epoch": 7.684210526315789,
"grad_norm": 0.0014876670902594924,
"learning_rate": 8.733272733199241e-05,
"loss": 0.0002,
"step": 949
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.01848655752837658,
"learning_rate": 8.72949399040366e-05,
"loss": 0.0004,
"step": 950
},
{
"epoch": 7.6923076923076925,
"eval_loss": 0.0013926097890362144,
"eval_runtime": 20.8744,
"eval_samples_per_second": 4.791,
"eval_steps_per_second": 1.198,
"step": 950
},
{
"epoch": 7.700404858299595,
"grad_norm": 0.006439445540308952,
"learning_rate": 8.725710439975247e-05,
"loss": 0.0004,
"step": 951
},
{
"epoch": 7.708502024291498,
"grad_norm": 0.004773348104208708,
"learning_rate": 8.721922086791321e-05,
"loss": 0.0003,
"step": 952
},
{
"epoch": 7.716599190283401,
"grad_norm": 0.00517980707809329,
"learning_rate": 8.71812893573539e-05,
"loss": 0.0004,
"step": 953
},
{
"epoch": 7.724696356275303,
"grad_norm": 0.006808164995163679,
"learning_rate": 8.714330991697144e-05,
"loss": 0.0004,
"step": 954
},
{
"epoch": 7.732793522267206,
"grad_norm": 0.0021944716572761536,
"learning_rate": 8.710528259572456e-05,
"loss": 0.0003,
"step": 955
},
{
"epoch": 7.7408906882591095,
"grad_norm": 0.002964154351502657,
"learning_rate": 8.706720744263368e-05,
"loss": 0.0003,
"step": 956
},
{
"epoch": 7.748987854251012,
"grad_norm": 0.007951617240905762,
"learning_rate": 8.702908450678088e-05,
"loss": 0.0005,
"step": 957
},
{
"epoch": 7.757085020242915,
"grad_norm": 0.007497166749089956,
"learning_rate": 8.699091383730987e-05,
"loss": 0.0006,
"step": 958
},
{
"epoch": 7.765182186234818,
"grad_norm": 0.0034041095059365034,
"learning_rate": 8.695269548342584e-05,
"loss": 0.0003,
"step": 959
},
{
"epoch": 7.77327935222672,
"grad_norm": 0.006926527712494135,
"learning_rate": 8.691442949439548e-05,
"loss": 0.0006,
"step": 960
},
{
"epoch": 7.781376518218623,
"grad_norm": 0.006445242557674646,
"learning_rate": 8.68761159195469e-05,
"loss": 0.0005,
"step": 961
},
{
"epoch": 7.7894736842105265,
"grad_norm": 0.004453368950635195,
"learning_rate": 8.683775480826953e-05,
"loss": 0.0003,
"step": 962
},
{
"epoch": 7.797570850202429,
"grad_norm": 0.005603456404060125,
"learning_rate": 8.679934621001407e-05,
"loss": 0.0003,
"step": 963
},
{
"epoch": 7.805668016194332,
"grad_norm": 0.005167273338884115,
"learning_rate": 8.676089017429246e-05,
"loss": 0.0004,
"step": 964
},
{
"epoch": 7.813765182186235,
"grad_norm": 0.005380601156502962,
"learning_rate": 8.672238675067779e-05,
"loss": 0.0005,
"step": 965
},
{
"epoch": 7.821862348178137,
"grad_norm": 0.008584062568843365,
"learning_rate": 8.668383598880419e-05,
"loss": 0.0004,
"step": 966
},
{
"epoch": 7.82995951417004,
"grad_norm": 0.004554002545773983,
"learning_rate": 8.664523793836688e-05,
"loss": 0.0004,
"step": 967
},
{
"epoch": 7.838056680161944,
"grad_norm": 0.006077317520976067,
"learning_rate": 8.660659264912202e-05,
"loss": 0.0003,
"step": 968
},
{
"epoch": 7.846153846153846,
"grad_norm": 0.005693916697055101,
"learning_rate": 8.656790017088659e-05,
"loss": 0.0003,
"step": 969
},
{
"epoch": 7.854251012145749,
"grad_norm": 0.006276635453104973,
"learning_rate": 8.652916055353852e-05,
"loss": 0.0005,
"step": 970
},
{
"epoch": 7.862348178137652,
"grad_norm": 0.006607879418879747,
"learning_rate": 8.649037384701643e-05,
"loss": 0.0003,
"step": 971
},
{
"epoch": 7.870445344129554,
"grad_norm": 0.005511141382157803,
"learning_rate": 8.645154010131968e-05,
"loss": 0.0004,
"step": 972
},
{
"epoch": 7.8785425101214575,
"grad_norm": 0.004318153951317072,
"learning_rate": 8.641265936650824e-05,
"loss": 0.0003,
"step": 973
},
{
"epoch": 7.886639676113361,
"grad_norm": 0.007693867664784193,
"learning_rate": 8.637373169270264e-05,
"loss": 0.0004,
"step": 974
},
{
"epoch": 7.894736842105263,
"grad_norm": 0.005701543763279915,
"learning_rate": 8.633475713008396e-05,
"loss": 0.0005,
"step": 975
},
{
"epoch": 7.894736842105263,
"eval_loss": 0.0013676926027983427,
"eval_runtime": 20.8821,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 1.197,
"step": 975
},
{
"epoch": 7.902834008097166,
"grad_norm": 0.003408844815567136,
"learning_rate": 8.62957357288937e-05,
"loss": 0.0002,
"step": 976
},
{
"epoch": 7.910931174089069,
"grad_norm": 0.005576197523623705,
"learning_rate": 8.625666753943375e-05,
"loss": 0.0003,
"step": 977
},
{
"epoch": 7.919028340080971,
"grad_norm": 0.0071185557171702385,
"learning_rate": 8.62175526120663e-05,
"loss": 0.0005,
"step": 978
},
{
"epoch": 7.9271255060728745,
"grad_norm": 0.0053787208162248135,
"learning_rate": 8.617839099721379e-05,
"loss": 0.0004,
"step": 979
},
{
"epoch": 7.935222672064778,
"grad_norm": 0.002727283863350749,
"learning_rate": 8.613918274535884e-05,
"loss": 0.0002,
"step": 980
},
{
"epoch": 7.94331983805668,
"grad_norm": 0.003927669022232294,
"learning_rate": 8.609992790704424e-05,
"loss": 0.0003,
"step": 981
},
{
"epoch": 7.951417004048583,
"grad_norm": 0.0018980104941874743,
"learning_rate": 8.606062653287276e-05,
"loss": 0.0002,
"step": 982
},
{
"epoch": 7.959514170040486,
"grad_norm": 0.004018806852400303,
"learning_rate": 8.60212786735072e-05,
"loss": 0.0004,
"step": 983
},
{
"epoch": 7.967611336032388,
"grad_norm": 0.0028705436270684004,
"learning_rate": 8.598188437967027e-05,
"loss": 0.0003,
"step": 984
},
{
"epoch": 7.9757085020242915,
"grad_norm": 0.0033207128290086985,
"learning_rate": 8.594244370214455e-05,
"loss": 0.0002,
"step": 985
},
{
"epoch": 7.983805668016195,
"grad_norm": 0.0061567616648972034,
"learning_rate": 8.59029566917724e-05,
"loss": 0.0005,
"step": 986
},
{
"epoch": 7.991902834008097,
"grad_norm": 0.005984210874885321,
"learning_rate": 8.58634233994559e-05,
"loss": 0.0005,
"step": 987
},
{
"epoch": 8.0,
"grad_norm": 0.004795044660568237,
"learning_rate": 8.582384387615685e-05,
"loss": 0.0005,
"step": 988
},
{
"epoch": 8.008097165991902,
"grad_norm": 0.003854956943541765,
"learning_rate": 8.578421817289654e-05,
"loss": 0.0004,
"step": 989
},
{
"epoch": 8.016194331983806,
"grad_norm": 0.001711231074295938,
"learning_rate": 8.57445463407559e-05,
"loss": 0.0002,
"step": 990
},
{
"epoch": 8.024291497975709,
"grad_norm": 0.0018254719907417893,
"learning_rate": 8.570482843087524e-05,
"loss": 0.0002,
"step": 991
},
{
"epoch": 8.03238866396761,
"grad_norm": 0.0035706062335520983,
"learning_rate": 8.566506449445432e-05,
"loss": 0.0002,
"step": 992
},
{
"epoch": 8.040485829959515,
"grad_norm": 0.00326129631139338,
"learning_rate": 8.562525458275219e-05,
"loss": 0.0002,
"step": 993
},
{
"epoch": 8.048582995951417,
"grad_norm": 0.004104081075638533,
"learning_rate": 8.558539874708722e-05,
"loss": 0.0003,
"step": 994
},
{
"epoch": 8.05668016194332,
"grad_norm": 0.0026021164376288652,
"learning_rate": 8.554549703883692e-05,
"loss": 0.0003,
"step": 995
},
{
"epoch": 8.064777327935223,
"grad_norm": 0.00469414284452796,
"learning_rate": 8.550554950943798e-05,
"loss": 0.0003,
"step": 996
},
{
"epoch": 8.072874493927126,
"grad_norm": 0.012351175770163536,
"learning_rate": 8.546555621038613e-05,
"loss": 0.0005,
"step": 997
},
{
"epoch": 8.080971659919028,
"grad_norm": 0.005616688635200262,
"learning_rate": 8.542551719323613e-05,
"loss": 0.0003,
"step": 998
},
{
"epoch": 8.089068825910932,
"grad_norm": 0.0016029590042307973,
"learning_rate": 8.538543250960164e-05,
"loss": 0.0002,
"step": 999
},
{
"epoch": 8.097165991902834,
"grad_norm": 0.0028078278992325068,
"learning_rate": 8.534530221115519e-05,
"loss": 0.0003,
"step": 1000
},
{
"epoch": 8.097165991902834,
"eval_loss": 0.001421115593984723,
"eval_runtime": 20.8707,
"eval_samples_per_second": 4.791,
"eval_steps_per_second": 1.198,
"step": 1000
},
{
"epoch": 8.105263157894736,
"grad_norm": 0.004125694278627634,
"learning_rate": 8.530512634962817e-05,
"loss": 0.0003,
"step": 1001
},
{
"epoch": 8.11336032388664,
"grad_norm": 0.0021855218801647425,
"learning_rate": 8.526490497681063e-05,
"loss": 0.0002,
"step": 1002
},
{
"epoch": 8.121457489878543,
"grad_norm": 0.0038041502702981234,
"learning_rate": 8.52246381445513e-05,
"loss": 0.0003,
"step": 1003
},
{
"epoch": 8.129554655870445,
"grad_norm": 0.006479162722826004,
"learning_rate": 8.518432590475756e-05,
"loss": 0.0003,
"step": 1004
},
{
"epoch": 8.137651821862349,
"grad_norm": 0.004671668168157339,
"learning_rate": 8.514396830939528e-05,
"loss": 0.0005,
"step": 1005
},
{
"epoch": 8.145748987854251,
"grad_norm": 0.0065527367405593395,
"learning_rate": 8.51035654104888e-05,
"loss": 0.0005,
"step": 1006
},
{
"epoch": 8.153846153846153,
"grad_norm": 0.0015392835484817624,
"learning_rate": 8.50631172601209e-05,
"loss": 0.0002,
"step": 1007
},
{
"epoch": 8.161943319838057,
"grad_norm": 0.003252090886235237,
"learning_rate": 8.502262391043264e-05,
"loss": 0.0003,
"step": 1008
},
{
"epoch": 8.17004048582996,
"grad_norm": 0.007202859967947006,
"learning_rate": 8.498208541362335e-05,
"loss": 0.0004,
"step": 1009
},
{
"epoch": 8.178137651821862,
"grad_norm": 0.005900178104639053,
"learning_rate": 8.494150182195062e-05,
"loss": 0.0004,
"step": 1010
},
{
"epoch": 8.186234817813766,
"grad_norm": 0.0021305892150849104,
"learning_rate": 8.49008731877301e-05,
"loss": 0.0002,
"step": 1011
},
{
"epoch": 8.194331983805668,
"grad_norm": 0.0073636360466480255,
"learning_rate": 8.486019956333555e-05,
"loss": 0.0003,
"step": 1012
},
{
"epoch": 8.20242914979757,
"grad_norm": 0.006871379911899567,
"learning_rate": 8.48194810011987e-05,
"loss": 0.0006,
"step": 1013
},
{
"epoch": 8.210526315789474,
"grad_norm": 0.004495650064200163,
"learning_rate": 8.47787175538092e-05,
"loss": 0.0003,
"step": 1014
},
{
"epoch": 8.218623481781377,
"grad_norm": 0.008418884128332138,
"learning_rate": 8.47379092737146e-05,
"loss": 0.0004,
"step": 1015
},
{
"epoch": 8.226720647773279,
"grad_norm": 0.0037393553648144007,
"learning_rate": 8.46970562135202e-05,
"loss": 0.0003,
"step": 1016
},
{
"epoch": 8.234817813765183,
"grad_norm": 0.003387110074982047,
"learning_rate": 8.465615842588908e-05,
"loss": 0.0003,
"step": 1017
},
{
"epoch": 8.242914979757085,
"grad_norm": 0.00927880872040987,
"learning_rate": 8.46152159635419e-05,
"loss": 0.0005,
"step": 1018
},
{
"epoch": 8.251012145748987,
"grad_norm": 0.0031711291521787643,
"learning_rate": 8.457422887925698e-05,
"loss": 0.0002,
"step": 1019
},
{
"epoch": 8.259109311740891,
"grad_norm": 0.00468886224552989,
"learning_rate": 8.453319722587014e-05,
"loss": 0.0003,
"step": 1020
},
{
"epoch": 8.267206477732794,
"grad_norm": 0.0016716530080884695,
"learning_rate": 8.449212105627464e-05,
"loss": 0.0002,
"step": 1021
},
{
"epoch": 8.275303643724696,
"grad_norm": 0.005183354951441288,
"learning_rate": 8.445100042342111e-05,
"loss": 0.0002,
"step": 1022
},
{
"epoch": 8.2834008097166,
"grad_norm": 0.006208460312336683,
"learning_rate": 8.440983538031754e-05,
"loss": 0.0005,
"step": 1023
},
{
"epoch": 8.291497975708502,
"grad_norm": 0.005015834234654903,
"learning_rate": 8.436862598002917e-05,
"loss": 0.0003,
"step": 1024
},
{
"epoch": 8.299595141700404,
"grad_norm": 0.00472809886559844,
"learning_rate": 8.432737227567836e-05,
"loss": 0.0003,
"step": 1025
},
{
"epoch": 8.299595141700404,
"eval_loss": 0.0013881891500204802,
"eval_runtime": 20.8543,
"eval_samples_per_second": 4.795,
"eval_steps_per_second": 1.199,
"step": 1025
},
{
"epoch": 8.307692307692308,
"grad_norm": 0.0067214383743703365,
"learning_rate": 8.428607432044464e-05,
"loss": 0.0003,
"step": 1026
},
{
"epoch": 8.31578947368421,
"grad_norm": 0.0032693762332201004,
"learning_rate": 8.424473216756456e-05,
"loss": 0.0002,
"step": 1027
},
{
"epoch": 8.323886639676113,
"grad_norm": 0.003940957598388195,
"learning_rate": 8.420334587033164e-05,
"loss": 0.0002,
"step": 1028
},
{
"epoch": 8.331983805668017,
"grad_norm": 0.0030299958307296038,
"learning_rate": 8.416191548209634e-05,
"loss": 0.0003,
"step": 1029
},
{
"epoch": 8.34008097165992,
"grad_norm": 0.006264000199735165,
"learning_rate": 8.412044105626588e-05,
"loss": 0.0003,
"step": 1030
},
{
"epoch": 8.348178137651821,
"grad_norm": 0.005418987013399601,
"learning_rate": 8.407892264630435e-05,
"loss": 0.0003,
"step": 1031
},
{
"epoch": 8.356275303643725,
"grad_norm": 0.004369835369288921,
"learning_rate": 8.403736030573246e-05,
"loss": 0.0003,
"step": 1032
},
{
"epoch": 8.364372469635628,
"grad_norm": 0.0046693203039467335,
"learning_rate": 8.399575408812759e-05,
"loss": 0.0002,
"step": 1033
},
{
"epoch": 8.37246963562753,
"grad_norm": 0.006310211028903723,
"learning_rate": 8.395410404712366e-05,
"loss": 0.0003,
"step": 1034
},
{
"epoch": 8.380566801619434,
"grad_norm": 0.005021234508603811,
"learning_rate": 8.39124102364111e-05,
"loss": 0.0002,
"step": 1035
},
{
"epoch": 8.388663967611336,
"grad_norm": 0.006567994132637978,
"learning_rate": 8.387067270973676e-05,
"loss": 0.0003,
"step": 1036
},
{
"epoch": 8.396761133603238,
"grad_norm": 0.003345700679346919,
"learning_rate": 8.382889152090382e-05,
"loss": 0.0003,
"step": 1037
},
{
"epoch": 8.404858299595142,
"grad_norm": 0.004254522267729044,
"learning_rate": 8.378706672377177e-05,
"loss": 0.0002,
"step": 1038
},
{
"epoch": 8.412955465587045,
"grad_norm": 0.004765935242176056,
"learning_rate": 8.374519837225632e-05,
"loss": 0.0002,
"step": 1039
},
{
"epoch": 8.421052631578947,
"grad_norm": 0.004489066544920206,
"learning_rate": 8.370328652032928e-05,
"loss": 0.0003,
"step": 1040
},
{
"epoch": 8.429149797570851,
"grad_norm": 0.003664980176836252,
"learning_rate": 8.366133122201861e-05,
"loss": 0.0002,
"step": 1041
},
{
"epoch": 8.437246963562753,
"grad_norm": 0.0038972117472440004,
"learning_rate": 8.361933253140821e-05,
"loss": 0.0003,
"step": 1042
},
{
"epoch": 8.445344129554655,
"grad_norm": 0.006108574103564024,
"learning_rate": 8.357729050263794e-05,
"loss": 0.0003,
"step": 1043
},
{
"epoch": 8.45344129554656,
"grad_norm": 0.005771995056420565,
"learning_rate": 8.353520518990353e-05,
"loss": 0.0003,
"step": 1044
},
{
"epoch": 8.461538461538462,
"grad_norm": 0.0028857016004621983,
"learning_rate": 8.34930766474565e-05,
"loss": 0.0002,
"step": 1045
},
{
"epoch": 8.469635627530364,
"grad_norm": 0.002775567816570401,
"learning_rate": 8.34509049296041e-05,
"loss": 0.0002,
"step": 1046
},
{
"epoch": 8.477732793522268,
"grad_norm": 0.00725373113527894,
"learning_rate": 8.340869009070924e-05,
"loss": 0.0005,
"step": 1047
},
{
"epoch": 8.48582995951417,
"grad_norm": 0.008823963813483715,
"learning_rate": 8.336643218519043e-05,
"loss": 0.0005,
"step": 1048
},
{
"epoch": 8.493927125506072,
"grad_norm": 0.00491480203345418,
"learning_rate": 8.332413126752165e-05,
"loss": 0.0003,
"step": 1049
},
{
"epoch": 8.502024291497976,
"grad_norm": 0.00424035731703043,
"learning_rate": 8.328178739223238e-05,
"loss": 0.0003,
"step": 1050
},
{
"epoch": 8.502024291497976,
"eval_loss": 0.001589785679243505,
"eval_runtime": 20.8531,
"eval_samples_per_second": 4.795,
"eval_steps_per_second": 1.199,
"step": 1050
},
{
"epoch": 8.510121457489879,
"grad_norm": 0.00579440500587225,
"learning_rate": 8.323940061390745e-05,
"loss": 0.0003,
"step": 1051
},
{
"epoch": 8.518218623481781,
"grad_norm": 0.008121831342577934,
"learning_rate": 8.319697098718697e-05,
"loss": 0.0005,
"step": 1052
},
{
"epoch": 8.526315789473685,
"grad_norm": 0.003369817277416587,
"learning_rate": 8.315449856676636e-05,
"loss": 0.0003,
"step": 1053
},
{
"epoch": 8.534412955465587,
"grad_norm": 0.00540441507473588,
"learning_rate": 8.311198340739612e-05,
"loss": 0.0003,
"step": 1054
},
{
"epoch": 8.54251012145749,
"grad_norm": 0.0026687076315283775,
"learning_rate": 8.306942556388189e-05,
"loss": 0.0002,
"step": 1055
},
{
"epoch": 8.550607287449393,
"grad_norm": 0.007054275833070278,
"learning_rate": 8.302682509108435e-05,
"loss": 0.0004,
"step": 1056
},
{
"epoch": 8.558704453441296,
"grad_norm": 0.002858961233869195,
"learning_rate": 8.298418204391907e-05,
"loss": 0.0002,
"step": 1057
},
{
"epoch": 8.566801619433198,
"grad_norm": 0.005906047765165567,
"learning_rate": 8.294149647735659e-05,
"loss": 0.0005,
"step": 1058
},
{
"epoch": 8.574898785425102,
"grad_norm": 0.0035569374449551105,
"learning_rate": 8.289876844642215e-05,
"loss": 0.0003,
"step": 1059
},
{
"epoch": 8.582995951417004,
"grad_norm": 0.004486700054258108,
"learning_rate": 8.285599800619584e-05,
"loss": 0.0003,
"step": 1060
},
{
"epoch": 8.591093117408906,
"grad_norm": 0.003070216393098235,
"learning_rate": 8.281318521181234e-05,
"loss": 0.0003,
"step": 1061
},
{
"epoch": 8.59919028340081,
"grad_norm": 0.0035653486847877502,
"learning_rate": 8.277033011846099e-05,
"loss": 0.0004,
"step": 1062
},
{
"epoch": 8.607287449392713,
"grad_norm": 0.004825572948902845,
"learning_rate": 8.27274327813856e-05,
"loss": 0.0003,
"step": 1063
},
{
"epoch": 8.615384615384615,
"grad_norm": 0.0030901050195097923,
"learning_rate": 8.268449325588447e-05,
"loss": 0.0002,
"step": 1064
},
{
"epoch": 8.623481781376519,
"grad_norm": 0.004419588949531317,
"learning_rate": 8.264151159731029e-05,
"loss": 0.0003,
"step": 1065
},
{
"epoch": 8.631578947368421,
"grad_norm": 0.0031760812271386385,
"learning_rate": 8.259848786107003e-05,
"loss": 0.0003,
"step": 1066
},
{
"epoch": 8.639676113360323,
"grad_norm": 0.009143234230577946,
"learning_rate": 8.25554221026249e-05,
"loss": 0.0004,
"step": 1067
},
{
"epoch": 8.647773279352228,
"grad_norm": 0.0034755307715386152,
"learning_rate": 8.251231437749036e-05,
"loss": 0.0003,
"step": 1068
},
{
"epoch": 8.65587044534413,
"grad_norm": 0.008503805845975876,
"learning_rate": 8.246916474123586e-05,
"loss": 0.0003,
"step": 1069
},
{
"epoch": 8.663967611336032,
"grad_norm": 0.0027896000538021326,
"learning_rate": 8.242597324948496e-05,
"loss": 0.0003,
"step": 1070
},
{
"epoch": 8.672064777327936,
"grad_norm": 0.003082460490986705,
"learning_rate": 8.23827399579151e-05,
"loss": 0.0002,
"step": 1071
},
{
"epoch": 8.680161943319838,
"grad_norm": 0.004392626229673624,
"learning_rate": 8.233946492225769e-05,
"loss": 0.0004,
"step": 1072
},
{
"epoch": 8.68825910931174,
"grad_norm": 0.0028719629626721144,
"learning_rate": 8.229614819829787e-05,
"loss": 0.0002,
"step": 1073
},
{
"epoch": 8.696356275303645,
"grad_norm": 0.00670055765658617,
"learning_rate": 8.225278984187459e-05,
"loss": 0.0004,
"step": 1074
},
{
"epoch": 8.704453441295547,
"grad_norm": 0.0042137037962675095,
"learning_rate": 8.220938990888041e-05,
"loss": 0.0003,
"step": 1075
},
{
"epoch": 8.704453441295547,
"eval_loss": 0.0013522603549063206,
"eval_runtime": 20.8761,
"eval_samples_per_second": 4.79,
"eval_steps_per_second": 1.198,
"step": 1075
},
{
"epoch": 8.712550607287449,
"grad_norm": 0.004317829851061106,
"learning_rate": 8.216594845526154e-05,
"loss": 0.0002,
"step": 1076
},
{
"epoch": 8.720647773279353,
"grad_norm": 0.0006851058569736779,
"learning_rate": 8.212246553701764e-05,
"loss": 0.0002,
"step": 1077
},
{
"epoch": 8.728744939271255,
"grad_norm": 0.003451045835390687,
"learning_rate": 8.207894121020188e-05,
"loss": 0.0002,
"step": 1078
},
{
"epoch": 8.736842105263158,
"grad_norm": 0.004197864327579737,
"learning_rate": 8.203537553092081e-05,
"loss": 0.0003,
"step": 1079
},
{
"epoch": 8.744939271255062,
"grad_norm": 0.003602301701903343,
"learning_rate": 8.199176855533426e-05,
"loss": 0.0002,
"step": 1080
},
{
"epoch": 8.753036437246964,
"grad_norm": 0.004492857493460178,
"learning_rate": 8.194812033965532e-05,
"loss": 0.0004,
"step": 1081
},
{
"epoch": 8.761133603238866,
"grad_norm": 0.006654155440628529,
"learning_rate": 8.190443094015022e-05,
"loss": 0.0004,
"step": 1082
},
{
"epoch": 8.76923076923077,
"grad_norm": 0.0013280883431434631,
"learning_rate": 8.186070041313827e-05,
"loss": 0.0002,
"step": 1083
},
{
"epoch": 8.777327935222672,
"grad_norm": 0.003171891439706087,
"learning_rate": 8.181692881499183e-05,
"loss": 0.0002,
"step": 1084
},
{
"epoch": 8.785425101214575,
"grad_norm": 0.005619837902486324,
"learning_rate": 8.177311620213617e-05,
"loss": 0.0003,
"step": 1085
},
{
"epoch": 8.793522267206479,
"grad_norm": 0.0074705081060528755,
"learning_rate": 8.172926263104949e-05,
"loss": 0.0003,
"step": 1086
},
{
"epoch": 8.80161943319838,
"grad_norm": 0.003815494477748871,
"learning_rate": 8.168536815826271e-05,
"loss": 0.0002,
"step": 1087
},
{
"epoch": 8.809716599190283,
"grad_norm": 0.004843059927225113,
"learning_rate": 8.164143284035953e-05,
"loss": 0.0003,
"step": 1088
},
{
"epoch": 8.817813765182187,
"grad_norm": 0.006158561911433935,
"learning_rate": 8.159745673397628e-05,
"loss": 0.0003,
"step": 1089
},
{
"epoch": 8.82591093117409,
"grad_norm": 0.003568399930372834,
"learning_rate": 8.155343989580187e-05,
"loss": 0.0004,
"step": 1090
},
{
"epoch": 8.834008097165992,
"grad_norm": 0.012608661316335201,
"learning_rate": 8.150938238257773e-05,
"loss": 0.0003,
"step": 1091
},
{
"epoch": 8.842105263157894,
"grad_norm": 0.00646247249096632,
"learning_rate": 8.146528425109772e-05,
"loss": 0.0004,
"step": 1092
},
{
"epoch": 8.850202429149798,
"grad_norm": 0.004845927469432354,
"learning_rate": 8.142114555820807e-05,
"loss": 0.0003,
"step": 1093
},
{
"epoch": 8.8582995951417,
"grad_norm": 0.00976946298032999,
"learning_rate": 8.137696636080725e-05,
"loss": 0.0006,
"step": 1094
},
{
"epoch": 8.866396761133604,
"grad_norm": 0.003078792942687869,
"learning_rate": 8.1332746715846e-05,
"loss": 0.0003,
"step": 1095
},
{
"epoch": 8.874493927125506,
"grad_norm": 0.0021008781623095274,
"learning_rate": 8.12884866803272e-05,
"loss": 0.0003,
"step": 1096
},
{
"epoch": 8.882591093117409,
"grad_norm": 0.010477488860487938,
"learning_rate": 8.124418631130572e-05,
"loss": 0.0003,
"step": 1097
},
{
"epoch": 8.89068825910931,
"grad_norm": 0.004115893505513668,
"learning_rate": 8.119984566588852e-05,
"loss": 0.0003,
"step": 1098
},
{
"epoch": 8.898785425101215,
"grad_norm": 0.007485564332455397,
"learning_rate": 8.115546480123443e-05,
"loss": 0.0003,
"step": 1099
},
{
"epoch": 8.906882591093117,
"grad_norm": 0.006476998329162598,
"learning_rate": 8.111104377455412e-05,
"loss": 0.0003,
"step": 1100
},
{
"epoch": 8.906882591093117,
"eval_loss": 0.001462434884160757,
"eval_runtime": 20.9128,
"eval_samples_per_second": 4.782,
"eval_steps_per_second": 1.195,
"step": 1100
},
{
"epoch": 8.914979757085021,
"grad_norm": 0.003973628859966993,
"learning_rate": 8.106658264311007e-05,
"loss": 0.0002,
"step": 1101
},
{
"epoch": 8.923076923076923,
"grad_norm": 0.00737798260524869,
"learning_rate": 8.102208146421642e-05,
"loss": 0.0003,
"step": 1102
},
{
"epoch": 8.931174089068826,
"grad_norm": 0.005106086377054453,
"learning_rate": 8.097754029523892e-05,
"loss": 0.0003,
"step": 1103
},
{
"epoch": 8.939271255060728,
"grad_norm": 0.0030819557141512632,
"learning_rate": 8.093295919359496e-05,
"loss": 0.0003,
"step": 1104
},
{
"epoch": 8.947368421052632,
"grad_norm": 0.006060482934117317,
"learning_rate": 8.08883382167533e-05,
"loss": 0.0003,
"step": 1105
},
{
"epoch": 8.955465587044534,
"grad_norm": 0.0065495348535478115,
"learning_rate": 8.084367742223418e-05,
"loss": 0.0003,
"step": 1106
},
{
"epoch": 8.963562753036438,
"grad_norm": 0.004741148557513952,
"learning_rate": 8.079897686760911e-05,
"loss": 0.0004,
"step": 1107
},
{
"epoch": 8.97165991902834,
"grad_norm": 0.005379111971706152,
"learning_rate": 8.07542366105009e-05,
"loss": 0.0004,
"step": 1108
},
{
"epoch": 8.979757085020243,
"grad_norm": 0.0065238396637141705,
"learning_rate": 8.070945670858352e-05,
"loss": 0.0002,
"step": 1109
},
{
"epoch": 8.987854251012145,
"grad_norm": 0.003318206174299121,
"learning_rate": 8.066463721958204e-05,
"loss": 0.0003,
"step": 1110
},
{
"epoch": 8.995951417004049,
"grad_norm": 0.006058558821678162,
"learning_rate": 8.061977820127256e-05,
"loss": 0.0004,
"step": 1111
},
{
"epoch": 9.004048582995951,
"grad_norm": 0.008386258967220783,
"learning_rate": 8.057487971148216e-05,
"loss": 0.0004,
"step": 1112
},
{
"epoch": 9.012145748987853,
"grad_norm": 0.0031854738481342793,
"learning_rate": 8.052994180808877e-05,
"loss": 0.0002,
"step": 1113
},
{
"epoch": 9.020242914979757,
"grad_norm": 0.004263192415237427,
"learning_rate": 8.048496454902116e-05,
"loss": 0.0003,
"step": 1114
},
{
"epoch": 9.02834008097166,
"grad_norm": 0.005687515716999769,
"learning_rate": 8.043994799225882e-05,
"loss": 0.0003,
"step": 1115
},
{
"epoch": 9.036437246963562,
"grad_norm": 0.00459116417914629,
"learning_rate": 8.039489219583187e-05,
"loss": 0.0002,
"step": 1116
},
{
"epoch": 9.044534412955466,
"grad_norm": 0.002290780423209071,
"learning_rate": 8.034979721782108e-05,
"loss": 0.0003,
"step": 1117
},
{
"epoch": 9.052631578947368,
"grad_norm": 0.0025161972735077143,
"learning_rate": 8.030466311635762e-05,
"loss": 0.0002,
"step": 1118
},
{
"epoch": 9.06072874493927,
"grad_norm": 0.005804365035146475,
"learning_rate": 8.025948994962322e-05,
"loss": 0.0004,
"step": 1119
},
{
"epoch": 9.068825910931174,
"grad_norm": 0.003750093514099717,
"learning_rate": 8.02142777758499e-05,
"loss": 0.0003,
"step": 1120
},
{
"epoch": 9.076923076923077,
"grad_norm": 0.0025203884579241276,
"learning_rate": 8.016902665331994e-05,
"loss": 0.0002,
"step": 1121
},
{
"epoch": 9.085020242914979,
"grad_norm": 0.008073766715824604,
"learning_rate": 8.01237366403659e-05,
"loss": 0.0005,
"step": 1122
},
{
"epoch": 9.093117408906883,
"grad_norm": 0.005724397487938404,
"learning_rate": 8.007840779537039e-05,
"loss": 0.0003,
"step": 1123
},
{
"epoch": 9.101214574898785,
"grad_norm": 0.005232294090092182,
"learning_rate": 8.003304017676615e-05,
"loss": 0.0003,
"step": 1124
},
{
"epoch": 9.109311740890687,
"grad_norm": 0.006211146246641874,
"learning_rate": 7.998763384303587e-05,
"loss": 0.0003,
"step": 1125
},
{
"epoch": 9.109311740890687,
"eval_loss": 0.001377474400214851,
"eval_runtime": 20.882,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 1.197,
"step": 1125
},
{
"epoch": 9.117408906882591,
"grad_norm": 0.003781067207455635,
"learning_rate": 7.994218885271214e-05,
"loss": 0.0003,
"step": 1126
},
{
"epoch": 9.125506072874494,
"grad_norm": 0.0043622152879834175,
"learning_rate": 7.98967052643774e-05,
"loss": 0.0002,
"step": 1127
},
{
"epoch": 9.133603238866396,
"grad_norm": 0.0023519883397966623,
"learning_rate": 7.985118313666384e-05,
"loss": 0.0003,
"step": 1128
},
{
"epoch": 9.1417004048583,
"grad_norm": 0.008506176061928272,
"learning_rate": 7.980562252825332e-05,
"loss": 0.0002,
"step": 1129
},
{
"epoch": 9.149797570850202,
"grad_norm": 0.0056921737268567085,
"learning_rate": 7.976002349787732e-05,
"loss": 0.0003,
"step": 1130
},
{
"epoch": 9.157894736842104,
"grad_norm": 0.0015705007826909423,
"learning_rate": 7.971438610431684e-05,
"loss": 0.0002,
"step": 1131
},
{
"epoch": 9.165991902834008,
"grad_norm": 0.0007020162884145975,
"learning_rate": 7.966871040640233e-05,
"loss": 0.0002,
"step": 1132
},
{
"epoch": 9.17408906882591,
"grad_norm": 0.004871773067861795,
"learning_rate": 7.962299646301363e-05,
"loss": 0.0002,
"step": 1133
},
{
"epoch": 9.182186234817813,
"grad_norm": 0.004164962098002434,
"learning_rate": 7.957724433307989e-05,
"loss": 0.0003,
"step": 1134
},
{
"epoch": 9.190283400809717,
"grad_norm": 0.005270279943943024,
"learning_rate": 7.953145407557943e-05,
"loss": 0.0003,
"step": 1135
},
{
"epoch": 9.19838056680162,
"grad_norm": 0.003013407811522484,
"learning_rate": 7.948562574953982e-05,
"loss": 0.0003,
"step": 1136
},
{
"epoch": 9.206477732793521,
"grad_norm": 0.005532771814614534,
"learning_rate": 7.943975941403758e-05,
"loss": 0.0003,
"step": 1137
},
{
"epoch": 9.214574898785425,
"grad_norm": 0.003702058456838131,
"learning_rate": 7.939385512819833e-05,
"loss": 0.0005,
"step": 1138
},
{
"epoch": 9.222672064777328,
"grad_norm": 0.007236842531710863,
"learning_rate": 7.934791295119657e-05,
"loss": 0.0003,
"step": 1139
},
{
"epoch": 9.23076923076923,
"grad_norm": 0.0032832149881869555,
"learning_rate": 7.930193294225563e-05,
"loss": 0.0003,
"step": 1140
},
{
"epoch": 9.238866396761134,
"grad_norm": 0.009601681493222713,
"learning_rate": 7.925591516064763e-05,
"loss": 0.0004,
"step": 1141
},
{
"epoch": 9.246963562753036,
"grad_norm": 0.0053932759910821915,
"learning_rate": 7.920985966569342e-05,
"loss": 0.0005,
"step": 1142
},
{
"epoch": 9.255060728744938,
"grad_norm": 0.0016843380872160196,
"learning_rate": 7.916376651676234e-05,
"loss": 0.0002,
"step": 1143
},
{
"epoch": 9.263157894736842,
"grad_norm": 0.0068874419666826725,
"learning_rate": 7.911763577327243e-05,
"loss": 0.0003,
"step": 1144
},
{
"epoch": 9.271255060728745,
"grad_norm": 0.003599689109250903,
"learning_rate": 7.907146749469007e-05,
"loss": 0.0003,
"step": 1145
},
{
"epoch": 9.279352226720647,
"grad_norm": 0.003752979449927807,
"learning_rate": 7.902526174053011e-05,
"loss": 0.0003,
"step": 1146
},
{
"epoch": 9.287449392712551,
"grad_norm": 0.0033392952755093575,
"learning_rate": 7.897901857035564e-05,
"loss": 0.0003,
"step": 1147
},
{
"epoch": 9.295546558704453,
"grad_norm": 0.001785742468200624,
"learning_rate": 7.893273804377803e-05,
"loss": 0.0002,
"step": 1148
},
{
"epoch": 9.303643724696355,
"grad_norm": 0.0025425672065466642,
"learning_rate": 7.888642022045677e-05,
"loss": 0.0002,
"step": 1149
},
{
"epoch": 9.31174089068826,
"grad_norm": 0.004378551617264748,
"learning_rate": 7.884006516009947e-05,
"loss": 0.0004,
"step": 1150
},
{
"epoch": 9.31174089068826,
"eval_loss": 0.0015521374298259616,
"eval_runtime": 20.8671,
"eval_samples_per_second": 4.792,
"eval_steps_per_second": 1.198,
"step": 1150
},
{
"epoch": 9.319838056680162,
"grad_norm": 0.004232253413647413,
"learning_rate": 7.879367292246169e-05,
"loss": 0.0002,
"step": 1151
},
{
"epoch": 9.327935222672064,
"grad_norm": 0.00398236233741045,
"learning_rate": 7.874724356734698e-05,
"loss": 0.0002,
"step": 1152
},
{
"epoch": 9.336032388663968,
"grad_norm": 0.006320311687886715,
"learning_rate": 7.870077715460666e-05,
"loss": 0.0005,
"step": 1153
},
{
"epoch": 9.34412955465587,
"grad_norm": 0.005346038844436407,
"learning_rate": 7.865427374413991e-05,
"loss": 0.0003,
"step": 1154
},
{
"epoch": 9.352226720647772,
"grad_norm": 0.00476599158719182,
"learning_rate": 7.860773339589351e-05,
"loss": 0.0003,
"step": 1155
},
{
"epoch": 9.360323886639677,
"grad_norm": 0.0034629430156201124,
"learning_rate": 7.856115616986194e-05,
"loss": 0.0003,
"step": 1156
},
{
"epoch": 9.368421052631579,
"grad_norm": 0.0017381705110892653,
"learning_rate": 7.851454212608715e-05,
"loss": 0.0002,
"step": 1157
},
{
"epoch": 9.376518218623481,
"grad_norm": 0.0038830083794891834,
"learning_rate": 7.846789132465858e-05,
"loss": 0.0002,
"step": 1158
},
{
"epoch": 9.384615384615385,
"grad_norm": 0.0028537509497255087,
"learning_rate": 7.842120382571308e-05,
"loss": 0.0003,
"step": 1159
},
{
"epoch": 9.392712550607287,
"grad_norm": 0.0017488327575847507,
"learning_rate": 7.837447968943474e-05,
"loss": 0.0002,
"step": 1160
},
{
"epoch": 9.40080971659919,
"grad_norm": 0.0021484007593244314,
"learning_rate": 7.832771897605496e-05,
"loss": 0.0002,
"step": 1161
},
{
"epoch": 9.408906882591094,
"grad_norm": 0.003066607750952244,
"learning_rate": 7.828092174585221e-05,
"loss": 0.0002,
"step": 1162
},
{
"epoch": 9.417004048582996,
"grad_norm": 0.001146377413533628,
"learning_rate": 7.823408805915212e-05,
"loss": 0.0002,
"step": 1163
},
{
"epoch": 9.425101214574898,
"grad_norm": 0.004038800951093435,
"learning_rate": 7.818721797632724e-05,
"loss": 0.0002,
"step": 1164
},
{
"epoch": 9.433198380566802,
"grad_norm": 0.004361163824796677,
"learning_rate": 7.814031155779708e-05,
"loss": 0.0003,
"step": 1165
},
{
"epoch": 9.441295546558704,
"grad_norm": 0.004912042990326881,
"learning_rate": 7.809336886402796e-05,
"loss": 0.0003,
"step": 1166
},
{
"epoch": 9.449392712550607,
"grad_norm": 0.0021194296423345804,
"learning_rate": 7.804638995553297e-05,
"loss": 0.0003,
"step": 1167
},
{
"epoch": 9.45748987854251,
"grad_norm": 0.004582180175930262,
"learning_rate": 7.799937489287192e-05,
"loss": 0.0003,
"step": 1168
},
{
"epoch": 9.465587044534413,
"grad_norm": 0.0019999807700514793,
"learning_rate": 7.79523237366512e-05,
"loss": 0.0002,
"step": 1169
},
{
"epoch": 9.473684210526315,
"grad_norm": 0.0027365379501134157,
"learning_rate": 7.79052365475237e-05,
"loss": 0.0003,
"step": 1170
},
{
"epoch": 9.481781376518219,
"grad_norm": 0.003132071578875184,
"learning_rate": 7.785811338618878e-05,
"loss": 0.0002,
"step": 1171
},
{
"epoch": 9.489878542510121,
"grad_norm": 0.004992193076759577,
"learning_rate": 7.781095431339221e-05,
"loss": 0.0002,
"step": 1172
},
{
"epoch": 9.497975708502024,
"grad_norm": 0.005324013065546751,
"learning_rate": 7.776375938992599e-05,
"loss": 0.0004,
"step": 1173
},
{
"epoch": 9.506072874493928,
"grad_norm": 0.0014854903565719724,
"learning_rate": 7.771652867662838e-05,
"loss": 0.0002,
"step": 1174
},
{
"epoch": 9.51417004048583,
"grad_norm": 0.0030039751436561346,
"learning_rate": 7.766926223438375e-05,
"loss": 0.0002,
"step": 1175
},
{
"epoch": 9.51417004048583,
"eval_loss": 0.001370253972709179,
"eval_runtime": 20.9183,
"eval_samples_per_second": 4.781,
"eval_steps_per_second": 1.195,
"step": 1175
},
{
"epoch": 9.522267206477732,
"grad_norm": 0.0022788650821894407,
"learning_rate": 7.762196012412255e-05,
"loss": 0.0002,
"step": 1176
},
{
"epoch": 9.530364372469636,
"grad_norm": 0.006257211789488792,
"learning_rate": 7.757462240682119e-05,
"loss": 0.0003,
"step": 1177
},
{
"epoch": 9.538461538461538,
"grad_norm": 0.0036694249138236046,
"learning_rate": 7.752724914350196e-05,
"loss": 0.0003,
"step": 1178
},
{
"epoch": 9.54655870445344,
"grad_norm": 0.0010751505615189672,
"learning_rate": 7.747984039523304e-05,
"loss": 0.0002,
"step": 1179
},
{
"epoch": 9.554655870445345,
"grad_norm": 0.002307455288246274,
"learning_rate": 7.74323962231283e-05,
"loss": 0.0002,
"step": 1180
},
{
"epoch": 9.562753036437247,
"grad_norm": 0.0009123678901232779,
"learning_rate": 7.738491668834726e-05,
"loss": 0.0001,
"step": 1181
},
{
"epoch": 9.570850202429149,
"grad_norm": 0.009321003220975399,
"learning_rate": 7.733740185209508e-05,
"loss": 0.0003,
"step": 1182
},
{
"epoch": 9.578947368421053,
"grad_norm": 0.0018089297227561474,
"learning_rate": 7.728985177562239e-05,
"loss": 0.0002,
"step": 1183
},
{
"epoch": 9.587044534412955,
"grad_norm": 0.004776486661285162,
"learning_rate": 7.724226652022526e-05,
"loss": 0.0003,
"step": 1184
},
{
"epoch": 9.595141700404858,
"grad_norm": 0.0017958278767764568,
"learning_rate": 7.71946461472451e-05,
"loss": 0.0002,
"step": 1185
},
{
"epoch": 9.603238866396762,
"grad_norm": 0.0028115343302488327,
"learning_rate": 7.714699071806859e-05,
"loss": 0.0002,
"step": 1186
},
{
"epoch": 9.611336032388664,
"grad_norm": 0.0017144676530733705,
"learning_rate": 7.709930029412762e-05,
"loss": 0.0002,
"step": 1187
},
{
"epoch": 9.619433198380566,
"grad_norm": 0.0012224303791299462,
"learning_rate": 7.705157493689915e-05,
"loss": 0.0002,
"step": 1188
},
{
"epoch": 9.62753036437247,
"grad_norm": 0.0017139979172497988,
"learning_rate": 7.70038147079052e-05,
"loss": 0.0002,
"step": 1189
},
{
"epoch": 9.635627530364372,
"grad_norm": 0.00275692087598145,
"learning_rate": 7.695601966871277e-05,
"loss": 0.0002,
"step": 1190
},
{
"epoch": 9.643724696356275,
"grad_norm": 0.005676504224538803,
"learning_rate": 7.690818988093367e-05,
"loss": 0.0002,
"step": 1191
},
{
"epoch": 9.651821862348179,
"grad_norm": 0.012121010571718216,
"learning_rate": 7.686032540622457e-05,
"loss": 0.0002,
"step": 1192
},
{
"epoch": 9.65991902834008,
"grad_norm": 0.005054876208305359,
"learning_rate": 7.68124263062868e-05,
"loss": 0.0003,
"step": 1193
},
{
"epoch": 9.668016194331983,
"grad_norm": 0.002249806420877576,
"learning_rate": 7.676449264286633e-05,
"loss": 0.0002,
"step": 1194
},
{
"epoch": 9.676113360323887,
"grad_norm": 0.003644303185865283,
"learning_rate": 7.671652447775374e-05,
"loss": 0.0003,
"step": 1195
},
{
"epoch": 9.68421052631579,
"grad_norm": 0.001162796514108777,
"learning_rate": 7.666852187278402e-05,
"loss": 0.0001,
"step": 1196
},
{
"epoch": 9.692307692307692,
"grad_norm": 0.007950011640787125,
"learning_rate": 7.662048488983658e-05,
"loss": 0.0003,
"step": 1197
},
{
"epoch": 9.700404858299596,
"grad_norm": 0.005151614546775818,
"learning_rate": 7.657241359083518e-05,
"loss": 0.0002,
"step": 1198
},
{
"epoch": 9.708502024291498,
"grad_norm": 0.0012473291717469692,
"learning_rate": 7.652430803774778e-05,
"loss": 0.0002,
"step": 1199
},
{
"epoch": 9.7165991902834,
"grad_norm": 0.006338824518024921,
"learning_rate": 7.647616829258645e-05,
"loss": 0.0005,
"step": 1200
},
{
"epoch": 9.7165991902834,
"eval_loss": 0.001362333190627396,
"eval_runtime": 20.9031,
"eval_samples_per_second": 4.784,
"eval_steps_per_second": 1.196,
"step": 1200
},
{
"epoch": 9.724696356275304,
"grad_norm": 0.0037102538626641035,
"learning_rate": 7.642799441740745e-05,
"loss": 0.0002,
"step": 1201
},
{
"epoch": 9.732793522267206,
"grad_norm": 0.005634430330246687,
"learning_rate": 7.637978647431094e-05,
"loss": 0.0004,
"step": 1202
},
{
"epoch": 9.740890688259109,
"grad_norm": 0.0025536268949508667,
"learning_rate": 7.633154452544105e-05,
"loss": 0.0002,
"step": 1203
},
{
"epoch": 9.748987854251013,
"grad_norm": 0.003189551178365946,
"learning_rate": 7.628326863298573e-05,
"loss": 0.0003,
"step": 1204
},
{
"epoch": 9.757085020242915,
"grad_norm": 0.003228923538699746,
"learning_rate": 7.623495885917666e-05,
"loss": 0.0003,
"step": 1205
},
{
"epoch": 9.765182186234817,
"grad_norm": 0.004319958388805389,
"learning_rate": 7.618661526628926e-05,
"loss": 0.0002,
"step": 1206
},
{
"epoch": 9.773279352226721,
"grad_norm": 0.0017490462632849813,
"learning_rate": 7.613823791664244e-05,
"loss": 0.0002,
"step": 1207
},
{
"epoch": 9.781376518218623,
"grad_norm": 0.0028828205540776253,
"learning_rate": 7.608982687259876e-05,
"loss": 0.0002,
"step": 1208
},
{
"epoch": 9.789473684210526,
"grad_norm": 0.0010670493356883526,
"learning_rate": 7.604138219656411e-05,
"loss": 0.0002,
"step": 1209
},
{
"epoch": 9.79757085020243,
"grad_norm": 0.0030811934266239405,
"learning_rate": 7.599290395098777e-05,
"loss": 0.0004,
"step": 1210
},
{
"epoch": 9.805668016194332,
"grad_norm": 0.0027857047971338034,
"learning_rate": 7.594439219836229e-05,
"loss": 0.0002,
"step": 1211
},
{
"epoch": 9.813765182186234,
"grad_norm": 0.006909377872943878,
"learning_rate": 7.589584700122345e-05,
"loss": 0.0004,
"step": 1212
},
{
"epoch": 9.821862348178138,
"grad_norm": 0.003193581011146307,
"learning_rate": 7.584726842215009e-05,
"loss": 0.0002,
"step": 1213
},
{
"epoch": 9.82995951417004,
"grad_norm": 0.0016280598938465118,
"learning_rate": 7.579865652376407e-05,
"loss": 0.0002,
"step": 1214
},
{
"epoch": 9.838056680161943,
"grad_norm": 0.002516286913305521,
"learning_rate": 7.57500113687303e-05,
"loss": 0.0003,
"step": 1215
},
{
"epoch": 9.846153846153847,
"grad_norm": 0.0036160030867904425,
"learning_rate": 7.570133301975645e-05,
"loss": 0.0003,
"step": 1216
},
{
"epoch": 9.854251012145749,
"grad_norm": 0.006766727659851313,
"learning_rate": 7.565262153959301e-05,
"loss": 0.0002,
"step": 1217
},
{
"epoch": 9.862348178137651,
"grad_norm": 0.0024577074218541384,
"learning_rate": 7.560387699103323e-05,
"loss": 0.0002,
"step": 1218
},
{
"epoch": 9.870445344129555,
"grad_norm": 0.0015718834474682808,
"learning_rate": 7.555509943691296e-05,
"loss": 0.0002,
"step": 1219
},
{
"epoch": 9.878542510121457,
"grad_norm": 0.0027964310720562935,
"learning_rate": 7.550628894011053e-05,
"loss": 0.0002,
"step": 1220
},
{
"epoch": 9.88663967611336,
"grad_norm": 0.007514572236686945,
"learning_rate": 7.545744556354685e-05,
"loss": 0.0004,
"step": 1221
},
{
"epoch": 9.894736842105264,
"grad_norm": 0.009225533343851566,
"learning_rate": 7.540856937018515e-05,
"loss": 0.0003,
"step": 1222
},
{
"epoch": 9.902834008097166,
"grad_norm": 0.004441691096872091,
"learning_rate": 7.535966042303094e-05,
"loss": 0.0003,
"step": 1223
},
{
"epoch": 9.910931174089068,
"grad_norm": 0.0031521234195679426,
"learning_rate": 7.531071878513202e-05,
"loss": 0.0003,
"step": 1224
},
{
"epoch": 9.919028340080972,
"grad_norm": 0.0024993985425680876,
"learning_rate": 7.526174451957827e-05,
"loss": 0.0002,
"step": 1225
},
{
"epoch": 9.919028340080972,
"eval_loss": 0.0013724053278565407,
"eval_runtime": 20.8813,
"eval_samples_per_second": 4.789,
"eval_steps_per_second": 1.197,
"step": 1225
},
{
"epoch": 9.927125506072874,
"grad_norm": 0.0053532798774540424,
"learning_rate": 7.521273768950167e-05,
"loss": 0.0004,
"step": 1226
},
{
"epoch": 9.935222672064777,
"grad_norm": 0.0038510486483573914,
"learning_rate": 7.516369835807615e-05,
"loss": 0.0002,
"step": 1227
},
{
"epoch": 9.94331983805668,
"grad_norm": 0.003887588856741786,
"learning_rate": 7.511462658851759e-05,
"loss": 0.0002,
"step": 1228
},
{
"epoch": 9.951417004048583,
"grad_norm": 0.0037457100115716457,
"learning_rate": 7.50655224440836e-05,
"loss": 0.0004,
"step": 1229
},
{
"epoch": 9.959514170040485,
"grad_norm": 0.006001995410770178,
"learning_rate": 7.501638598807359e-05,
"loss": 0.0002,
"step": 1230
},
{
"epoch": 9.96761133603239,
"grad_norm": 0.0024609274696558714,
"learning_rate": 7.496721728382861e-05,
"loss": 0.0002,
"step": 1231
},
{
"epoch": 9.975708502024291,
"grad_norm": 0.008431760594248772,
"learning_rate": 7.491801639473127e-05,
"loss": 0.0004,
"step": 1232
},
{
"epoch": 9.983805668016194,
"grad_norm": 0.003166408510878682,
"learning_rate": 7.486878338420567e-05,
"loss": 0.0003,
"step": 1233
},
{
"epoch": 9.991902834008098,
"grad_norm": 0.005395290441811085,
"learning_rate": 7.48195183157173e-05,
"loss": 0.0003,
"step": 1234
},
{
"epoch": 10.0,
"grad_norm": 0.0012857104884460568,
"learning_rate": 7.477022125277304e-05,
"loss": 0.0003,
"step": 1235
},
{
"epoch": 10.008097165991902,
"grad_norm": 0.0011820251820608974,
"learning_rate": 7.472089225892093e-05,
"loss": 0.0002,
"step": 1236
},
{
"epoch": 10.016194331983806,
"grad_norm": 0.009494257159531116,
"learning_rate": 7.467153139775022e-05,
"loss": 0.0002,
"step": 1237
},
{
"epoch": 10.024291497975709,
"grad_norm": 0.000511787598952651,
"learning_rate": 7.462213873289123e-05,
"loss": 0.0002,
"step": 1238
},
{
"epoch": 10.03238866396761,
"grad_norm": 0.003136824816465378,
"learning_rate": 7.457271432801531e-05,
"loss": 0.0002,
"step": 1239
},
{
"epoch": 10.040485829959515,
"grad_norm": 0.0016326183686032891,
"learning_rate": 7.452325824683463e-05,
"loss": 0.0002,
"step": 1240
},
{
"epoch": 10.048582995951417,
"grad_norm": 0.001595051260665059,
"learning_rate": 7.447377055310231e-05,
"loss": 0.0002,
"step": 1241
},
{
"epoch": 10.05668016194332,
"grad_norm": 0.0034364748280495405,
"learning_rate": 7.442425131061215e-05,
"loss": 0.0002,
"step": 1242
},
{
"epoch": 10.064777327935223,
"grad_norm": 0.0006874504615552723,
"learning_rate": 7.437470058319865e-05,
"loss": 0.0002,
"step": 1243
},
{
"epoch": 10.072874493927126,
"grad_norm": 0.0012387243332341313,
"learning_rate": 7.432511843473683e-05,
"loss": 0.0002,
"step": 1244
},
{
"epoch": 10.080971659919028,
"grad_norm": 0.005311995279043913,
"learning_rate": 7.427550492914235e-05,
"loss": 0.0004,
"step": 1245
},
{
"epoch": 10.089068825910932,
"grad_norm": 0.007275078445672989,
"learning_rate": 7.422586013037114e-05,
"loss": 0.0002,
"step": 1246
},
{
"epoch": 10.097165991902834,
"grad_norm": 0.0030083160381764174,
"learning_rate": 7.417618410241959e-05,
"loss": 0.0002,
"step": 1247
},
{
"epoch": 10.105263157894736,
"grad_norm": 0.004656744189560413,
"learning_rate": 7.412647690932426e-05,
"loss": 0.0003,
"step": 1248
},
{
"epoch": 10.11336032388664,
"grad_norm": 0.0035645875614136457,
"learning_rate": 7.407673861516195e-05,
"loss": 0.0002,
"step": 1249
},
{
"epoch": 10.121457489878543,
"grad_norm": 0.0017369840061292052,
"learning_rate": 7.402696928404951e-05,
"loss": 0.0002,
"step": 1250
},
{
"epoch": 10.121457489878543,
"eval_loss": 0.0013646668521687388,
"eval_runtime": 20.8332,
"eval_samples_per_second": 4.8,
"eval_steps_per_second": 1.2,
"step": 1250
},
{
"epoch": 10.129554655870445,
"grad_norm": 0.0007923658704385161,
"learning_rate": 7.39771689801438e-05,
"loss": 0.0002,
"step": 1251
},
{
"epoch": 10.137651821862349,
"grad_norm": 0.005313723348081112,
"learning_rate": 7.392733776764164e-05,
"loss": 0.0002,
"step": 1252
},
{
"epoch": 10.145748987854251,
"grad_norm": 0.002177554415538907,
"learning_rate": 7.387747571077966e-05,
"loss": 0.0002,
"step": 1253
},
{
"epoch": 10.153846153846153,
"grad_norm": 0.0029326973017305136,
"learning_rate": 7.382758287383426e-05,
"loss": 0.0002,
"step": 1254
},
{
"epoch": 10.161943319838057,
"grad_norm": 0.001368851400911808,
"learning_rate": 7.377765932112157e-05,
"loss": 0.0002,
"step": 1255
},
{
"epoch": 10.17004048582996,
"grad_norm": 0.003802344435825944,
"learning_rate": 7.372770511699719e-05,
"loss": 0.0003,
"step": 1256
},
{
"epoch": 10.178137651821862,
"grad_norm": 0.0012702594976872206,
"learning_rate": 7.367772032585634e-05,
"loss": 0.0002,
"step": 1257
},
{
"epoch": 10.186234817813766,
"grad_norm": 0.005360572598874569,
"learning_rate": 7.362770501213367e-05,
"loss": 0.0003,
"step": 1258
},
{
"epoch": 10.194331983805668,
"grad_norm": 0.0019447492668405175,
"learning_rate": 7.357765924030311e-05,
"loss": 0.0002,
"step": 1259
},
{
"epoch": 10.20242914979757,
"grad_norm": 0.003229207592085004,
"learning_rate": 7.352758307487788e-05,
"loss": 0.0003,
"step": 1260
},
{
"epoch": 10.210526315789474,
"grad_norm": 0.0026793682482093573,
"learning_rate": 7.347747658041043e-05,
"loss": 0.0002,
"step": 1261
},
{
"epoch": 10.218623481781377,
"grad_norm": 0.0010839339811354876,
"learning_rate": 7.342733982149223e-05,
"loss": 0.0002,
"step": 1262
},
{
"epoch": 10.226720647773279,
"grad_norm": 0.004595485050231218,
"learning_rate": 7.33771728627538e-05,
"loss": 0.0003,
"step": 1263
},
{
"epoch": 10.234817813765183,
"grad_norm": 0.0012352195335552096,
"learning_rate": 7.332697576886462e-05,
"loss": 0.0002,
"step": 1264
},
{
"epoch": 10.242914979757085,
"grad_norm": 0.004273899365216494,
"learning_rate": 7.327674860453296e-05,
"loss": 0.0002,
"step": 1265
},
{
"epoch": 10.251012145748987,
"grad_norm": 0.005454242695122957,
"learning_rate": 7.322649143450585e-05,
"loss": 0.0003,
"step": 1266
},
{
"epoch": 10.259109311740891,
"grad_norm": 0.0013370011001825333,
"learning_rate": 7.317620432356907e-05,
"loss": 0.0002,
"step": 1267
},
{
"epoch": 10.267206477732794,
"grad_norm": 0.00569200748577714,
"learning_rate": 7.312588733654693e-05,
"loss": 0.0002,
"step": 1268
},
{
"epoch": 10.275303643724696,
"grad_norm": 0.0030299886129796505,
"learning_rate": 7.307554053830232e-05,
"loss": 0.0002,
"step": 1269
},
{
"epoch": 10.2834008097166,
"grad_norm": 0.0035660641733556986,
"learning_rate": 7.302516399373645e-05,
"loss": 0.0002,
"step": 1270
},
{
"epoch": 10.291497975708502,
"grad_norm": 0.0019571082666516304,
"learning_rate": 7.2974757767789e-05,
"loss": 0.0002,
"step": 1271
},
{
"epoch": 10.299595141700404,
"grad_norm": 0.0020669170189648867,
"learning_rate": 7.292432192543783e-05,
"loss": 0.0002,
"step": 1272
},
{
"epoch": 10.307692307692308,
"grad_norm": 0.0015401261625811458,
"learning_rate": 7.287385653169898e-05,
"loss": 0.0002,
"step": 1273
},
{
"epoch": 10.31578947368421,
"grad_norm": 0.0020590811036527157,
"learning_rate": 7.282336165162665e-05,
"loss": 0.0003,
"step": 1274
},
{
"epoch": 10.323886639676113,
"grad_norm": 0.0030925278551876545,
"learning_rate": 7.277283735031298e-05,
"loss": 0.0002,
"step": 1275
},
{
"epoch": 10.323886639676113,
"eval_loss": 0.0014393809251487255,
"eval_runtime": 20.8439,
"eval_samples_per_second": 4.798,
"eval_steps_per_second": 1.199,
"step": 1275
},
{
"epoch": 10.331983805668017,
"grad_norm": 0.002777695655822754,
"learning_rate": 7.272228369288806e-05,
"loss": 0.0002,
"step": 1276
},
{
"epoch": 10.34008097165992,
"grad_norm": 0.0012481295270845294,
"learning_rate": 7.267170074451983e-05,
"loss": 0.0002,
"step": 1277
},
{
"epoch": 10.348178137651821,
"grad_norm": 0.0015662169316783547,
"learning_rate": 7.262108857041399e-05,
"loss": 0.0002,
"step": 1278
},
{
"epoch": 10.356275303643725,
"grad_norm": 0.003938265610486269,
"learning_rate": 7.257044723581391e-05,
"loss": 0.0002,
"step": 1279
},
{
"epoch": 10.364372469635628,
"grad_norm": 0.0015693637542426586,
"learning_rate": 7.251977680600053e-05,
"loss": 0.0002,
"step": 1280
},
{
"epoch": 10.37246963562753,
"grad_norm": 0.002020388375967741,
"learning_rate": 7.246907734629233e-05,
"loss": 0.0002,
"step": 1281
},
{
"epoch": 10.380566801619434,
"grad_norm": 0.00170166976749897,
"learning_rate": 7.24183489220452e-05,
"loss": 0.0002,
"step": 1282
},
{
"epoch": 10.388663967611336,
"grad_norm": 0.002372957533225417,
"learning_rate": 7.236759159865236e-05,
"loss": 0.0002,
"step": 1283
},
{
"epoch": 10.396761133603238,
"grad_norm": 0.004695139825344086,
"learning_rate": 7.231680544154427e-05,
"loss": 0.0002,
"step": 1284
},
{
"epoch": 10.404858299595142,
"grad_norm": 0.002594963414594531,
"learning_rate": 7.226599051618863e-05,
"loss": 0.0002,
"step": 1285
},
{
"epoch": 10.412955465587045,
"grad_norm": 0.0015460449503734708,
"learning_rate": 7.22151468880901e-05,
"loss": 0.0002,
"step": 1286
},
{
"epoch": 10.421052631578947,
"grad_norm": 0.005851665511727333,
"learning_rate": 7.216427462279047e-05,
"loss": 0.0002,
"step": 1287
},
{
"epoch": 10.429149797570851,
"grad_norm": 0.0015737306093797088,
"learning_rate": 7.211337378586835e-05,
"loss": 0.0002,
"step": 1288
},
{
"epoch": 10.437246963562753,
"grad_norm": 0.003812261624261737,
"learning_rate": 7.206244444293925e-05,
"loss": 0.0002,
"step": 1289
},
{
"epoch": 10.445344129554655,
"grad_norm": 0.0015756061766296625,
"learning_rate": 7.201148665965536e-05,
"loss": 0.0002,
"step": 1290
},
{
"epoch": 10.45344129554656,
"grad_norm": 0.005322249606251717,
"learning_rate": 7.196050050170561e-05,
"loss": 0.0002,
"step": 1291
},
{
"epoch": 10.461538461538462,
"grad_norm": 0.0012198768090456724,
"learning_rate": 7.190948603481543e-05,
"loss": 0.0002,
"step": 1292
},
{
"epoch": 10.469635627530364,
"grad_norm": 0.002762486459687352,
"learning_rate": 7.185844332474679e-05,
"loss": 0.0003,
"step": 1293
},
{
"epoch": 10.477732793522268,
"grad_norm": 0.0013679059920832515,
"learning_rate": 7.180737243729804e-05,
"loss": 0.0002,
"step": 1294
},
{
"epoch": 10.48582995951417,
"grad_norm": 0.0009709448786452413,
"learning_rate": 7.175627343830392e-05,
"loss": 0.0002,
"step": 1295
},
{
"epoch": 10.493927125506072,
"grad_norm": 0.0033552187960594893,
"learning_rate": 7.17051463936353e-05,
"loss": 0.0003,
"step": 1296
},
{
"epoch": 10.502024291497976,
"grad_norm": 0.001450160052627325,
"learning_rate": 7.16539913691993e-05,
"loss": 0.0002,
"step": 1297
},
{
"epoch": 10.510121457489879,
"grad_norm": 0.0011616905685514212,
"learning_rate": 7.160280843093902e-05,
"loss": 0.0002,
"step": 1298
},
{
"epoch": 10.518218623481781,
"grad_norm": 0.0014929898316040635,
"learning_rate": 7.155159764483364e-05,
"loss": 0.0002,
"step": 1299
},
{
"epoch": 10.526315789473685,
"grad_norm": 0.0010820671450346708,
"learning_rate": 7.150035907689816e-05,
"loss": 0.0002,
"step": 1300
},
{
"epoch": 10.526315789473685,
"eval_loss": 0.0013635024661198258,
"eval_runtime": 20.8752,
"eval_samples_per_second": 4.79,
"eval_steps_per_second": 1.198,
"step": 1300
},
{
"epoch": 10.534412955465587,
"grad_norm": 0.002717435359954834,
"learning_rate": 7.144909279318344e-05,
"loss": 0.0002,
"step": 1301
},
{
"epoch": 10.54251012145749,
"grad_norm": 0.004127759486436844,
"learning_rate": 7.139779885977604e-05,
"loss": 0.0003,
"step": 1302
},
{
"epoch": 10.550607287449393,
"grad_norm": 0.0019464613869786263,
"learning_rate": 7.134647734279817e-05,
"loss": 0.0002,
"step": 1303
},
{
"epoch": 10.558704453441296,
"grad_norm": 0.002114477101713419,
"learning_rate": 7.129512830840763e-05,
"loss": 0.0002,
"step": 1304
},
{
"epoch": 10.566801619433198,
"grad_norm": 0.0025543624069541693,
"learning_rate": 7.124375182279762e-05,
"loss": 0.0003,
"step": 1305
},
{
"epoch": 10.574898785425102,
"grad_norm": 0.0009003437007777393,
"learning_rate": 7.11923479521968e-05,
"loss": 0.0002,
"step": 1306
},
{
"epoch": 10.582995951417004,
"grad_norm": 0.002608225215226412,
"learning_rate": 7.11409167628691e-05,
"loss": 0.0002,
"step": 1307
},
{
"epoch": 10.591093117408906,
"grad_norm": 0.004385102540254593,
"learning_rate": 7.108945832111366e-05,
"loss": 0.0003,
"step": 1308
},
{
"epoch": 10.59919028340081,
"grad_norm": 0.002555650193244219,
"learning_rate": 7.103797269326475e-05,
"loss": 0.0002,
"step": 1309
},
{
"epoch": 10.607287449392713,
"grad_norm": 0.00154935906175524,
"learning_rate": 7.098645994569171e-05,
"loss": 0.0002,
"step": 1310
},
{
"epoch": 10.615384615384615,
"grad_norm": 0.0027549327351152897,
"learning_rate": 7.093492014479884e-05,
"loss": 0.0002,
"step": 1311
},
{
"epoch": 10.623481781376519,
"grad_norm": 0.0012162472121417522,
"learning_rate": 7.088335335702525e-05,
"loss": 0.0002,
"step": 1312
},
{
"epoch": 10.631578947368421,
"grad_norm": 0.0018456067191436887,
"learning_rate": 7.083175964884491e-05,
"loss": 0.0002,
"step": 1313
},
{
"epoch": 10.639676113360323,
"grad_norm": 0.0008041391847655177,
"learning_rate": 7.078013908676649e-05,
"loss": 0.0002,
"step": 1314
},
{
"epoch": 10.647773279352228,
"grad_norm": 0.0049199191853404045,
"learning_rate": 7.072849173733323e-05,
"loss": 0.0002,
"step": 1315
},
{
"epoch": 10.65587044534413,
"grad_norm": 0.0091862166300416,
"learning_rate": 7.067681766712293e-05,
"loss": 0.0004,
"step": 1316
},
{
"epoch": 10.663967611336032,
"grad_norm": 0.0005279682809486985,
"learning_rate": 7.062511694274783e-05,
"loss": 0.0002,
"step": 1317
},
{
"epoch": 10.672064777327936,
"grad_norm": 0.003007502295076847,
"learning_rate": 7.057338963085453e-05,
"loss": 0.0003,
"step": 1318
},
{
"epoch": 10.680161943319838,
"grad_norm": 0.0021479730494320393,
"learning_rate": 7.052163579812393e-05,
"loss": 0.0002,
"step": 1319
},
{
"epoch": 10.68825910931174,
"grad_norm": 0.0024797257501631975,
"learning_rate": 7.046985551127106e-05,
"loss": 0.0002,
"step": 1320
},
{
"epoch": 10.696356275303645,
"grad_norm": 0.0019876586738973856,
"learning_rate": 7.04180488370451e-05,
"loss": 0.0002,
"step": 1321
},
{
"epoch": 10.704453441295547,
"grad_norm": 0.002082040999084711,
"learning_rate": 7.036621584222925e-05,
"loss": 0.0002,
"step": 1322
},
{
"epoch": 10.712550607287449,
"grad_norm": 0.0060581970028579235,
"learning_rate": 7.031435659364057e-05,
"loss": 0.0004,
"step": 1323
},
{
"epoch": 10.720647773279353,
"grad_norm": 0.0017555778613314033,
"learning_rate": 7.026247115813003e-05,
"loss": 0.0002,
"step": 1324
},
{
"epoch": 10.728744939271255,
"grad_norm": 0.003592435270547867,
"learning_rate": 7.021055960258239e-05,
"loss": 0.0002,
"step": 1325
},
{
"epoch": 10.728744939271255,
"eval_loss": 0.0015407928731292486,
"eval_runtime": 20.8476,
"eval_samples_per_second": 4.797,
"eval_steps_per_second": 1.199,
"step": 1325
},
{
"epoch": 10.728744939271255,
"step": 1325,
"total_flos": 3.261064452032889e+18,
"train_loss": 0.0019915386885712098,
"train_runtime": 25159.8869,
"train_samples_per_second": 3.927,
"train_steps_per_second": 0.122
}
],
"logging_steps": 1,
"max_steps": 3075,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 4
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.261064452032889e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}