{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.743034055727556,
"eval_steps": 500,
"global_step": 6700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030959752321981424,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 2.397,
"step": 10
},
{
"epoch": 0.06191950464396285,
"grad_norm": 0.69140625,
"learning_rate": 0.0001,
"loss": 2.1975,
"step": 20
},
{
"epoch": 0.09287925696594428,
"grad_norm": 0.765625,
"learning_rate": 0.0001,
"loss": 2.1624,
"step": 30
},
{
"epoch": 0.1238390092879257,
"grad_norm": 0.97265625,
"learning_rate": 0.0001,
"loss": 2.0724,
"step": 40
},
{
"epoch": 0.15479876160990713,
"grad_norm": 1.7734375,
"learning_rate": 0.0001,
"loss": 2.0248,
"step": 50
},
{
"epoch": 0.18575851393188855,
"grad_norm": 0.421875,
"learning_rate": 0.0001,
"loss": 2.2899,
"step": 60
},
{
"epoch": 0.21671826625386997,
"grad_norm": 0.73046875,
"learning_rate": 0.0001,
"loss": 2.1192,
"step": 70
},
{
"epoch": 0.2476780185758514,
"grad_norm": 0.734375,
"learning_rate": 0.0001,
"loss": 2.0941,
"step": 80
},
{
"epoch": 0.2786377708978328,
"grad_norm": 1.03125,
"learning_rate": 0.0001,
"loss": 2.0773,
"step": 90
},
{
"epoch": 0.30959752321981426,
"grad_norm": 1.40625,
"learning_rate": 0.0001,
"loss": 1.9767,
"step": 100
},
{
"epoch": 0.34055727554179566,
"grad_norm": 0.43359375,
"learning_rate": 0.0001,
"loss": 2.1887,
"step": 110
},
{
"epoch": 0.3715170278637771,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 2.0403,
"step": 120
},
{
"epoch": 0.4024767801857585,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 2.0129,
"step": 130
},
{
"epoch": 0.43343653250773995,
"grad_norm": 0.8515625,
"learning_rate": 0.0001,
"loss": 2.0155,
"step": 140
},
{
"epoch": 0.46439628482972134,
"grad_norm": 1.6796875,
"learning_rate": 0.0001,
"loss": 2.0159,
"step": 150
},
{
"epoch": 0.4953560371517028,
"grad_norm": 0.478515625,
"learning_rate": 0.0001,
"loss": 2.1815,
"step": 160
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 2.0064,
"step": 170
},
{
"epoch": 0.5572755417956656,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 2.03,
"step": 180
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.84765625,
"learning_rate": 0.0001,
"loss": 2.0138,
"step": 190
},
{
"epoch": 0.6191950464396285,
"grad_norm": 1.53125,
"learning_rate": 0.0001,
"loss": 1.9765,
"step": 200
},
{
"epoch": 0.6501547987616099,
"grad_norm": 0.46875,
"learning_rate": 0.0001,
"loss": 2.2076,
"step": 210
},
{
"epoch": 0.6811145510835913,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 2.0659,
"step": 220
},
{
"epoch": 0.7120743034055728,
"grad_norm": 0.7734375,
"learning_rate": 0.0001,
"loss": 2.0689,
"step": 230
},
{
"epoch": 0.7430340557275542,
"grad_norm": 0.83203125,
"learning_rate": 0.0001,
"loss": 1.8855,
"step": 240
},
{
"epoch": 0.7739938080495357,
"grad_norm": 1.3125,
"learning_rate": 0.0001,
"loss": 1.812,
"step": 250
},
{
"epoch": 0.804953560371517,
"grad_norm": 0.50390625,
"learning_rate": 0.0001,
"loss": 2.1327,
"step": 260
},
{
"epoch": 0.8359133126934984,
"grad_norm": 0.7109375,
"learning_rate": 0.0001,
"loss": 2.0221,
"step": 270
},
{
"epoch": 0.8668730650154799,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 1.9692,
"step": 280
},
{
"epoch": 0.8978328173374613,
"grad_norm": 0.9453125,
"learning_rate": 0.0001,
"loss": 1.8887,
"step": 290
},
{
"epoch": 0.9287925696594427,
"grad_norm": 1.453125,
"learning_rate": 0.0001,
"loss": 1.8405,
"step": 300
},
{
"epoch": 0.9597523219814241,
"grad_norm": 0.65234375,
"learning_rate": 0.0001,
"loss": 2.0955,
"step": 310
},
{
"epoch": 0.9907120743034056,
"grad_norm": 1.0234375,
"learning_rate": 0.0001,
"loss": 1.9367,
"step": 320
},
{
"epoch": 1.021671826625387,
"grad_norm": 0.53515625,
"learning_rate": 0.0001,
"loss": 2.0857,
"step": 330
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.8077,
"step": 340
},
{
"epoch": 1.08359133126935,
"grad_norm": 0.78515625,
"learning_rate": 0.0001,
"loss": 1.5881,
"step": 350
},
{
"epoch": 1.1145510835913313,
"grad_norm": 1.2265625,
"learning_rate": 0.0001,
"loss": 1.5215,
"step": 360
},
{
"epoch": 1.1455108359133126,
"grad_norm": 1.6328125,
"learning_rate": 0.0001,
"loss": 1.3684,
"step": 370
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 1.7801,
"step": 380
},
{
"epoch": 1.2074303405572755,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 1.7372,
"step": 390
},
{
"epoch": 1.238390092879257,
"grad_norm": 0.91015625,
"learning_rate": 0.0001,
"loss": 1.6767,
"step": 400
},
{
"epoch": 1.2693498452012384,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 1.4923,
"step": 410
},
{
"epoch": 1.3003095975232197,
"grad_norm": 1.34375,
"learning_rate": 0.0001,
"loss": 1.2514,
"step": 420
},
{
"epoch": 1.3312693498452013,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 1.7803,
"step": 430
},
{
"epoch": 1.3622291021671826,
"grad_norm": 0.7421875,
"learning_rate": 0.0001,
"loss": 1.7424,
"step": 440
},
{
"epoch": 1.3931888544891642,
"grad_norm": 0.796875,
"learning_rate": 0.0001,
"loss": 1.6923,
"step": 450
},
{
"epoch": 1.4241486068111455,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 1.4639,
"step": 460
},
{
"epoch": 1.4551083591331269,
"grad_norm": 1.5234375,
"learning_rate": 0.0001,
"loss": 1.4187,
"step": 470
},
{
"epoch": 1.4860681114551084,
"grad_norm": 0.78125,
"learning_rate": 0.0001,
"loss": 1.7615,
"step": 480
},
{
"epoch": 1.5170278637770898,
"grad_norm": 0.68359375,
"learning_rate": 0.0001,
"loss": 1.8067,
"step": 490
},
{
"epoch": 1.5479876160990713,
"grad_norm": 0.96484375,
"learning_rate": 0.0001,
"loss": 1.6603,
"step": 500
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.3046875,
"learning_rate": 0.0001,
"loss": 1.4714,
"step": 510
},
{
"epoch": 1.609907120743034,
"grad_norm": 1.53125,
"learning_rate": 0.0001,
"loss": 1.3003,
"step": 520
},
{
"epoch": 1.6408668730650153,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 1.783,
"step": 530
},
{
"epoch": 1.671826625386997,
"grad_norm": 0.73046875,
"learning_rate": 0.0001,
"loss": 1.8179,
"step": 540
},
{
"epoch": 1.7027863777089784,
"grad_norm": 1.0,
"learning_rate": 0.0001,
"loss": 1.6297,
"step": 550
},
{
"epoch": 1.7337461300309598,
"grad_norm": 1.109375,
"learning_rate": 0.0001,
"loss": 1.5301,
"step": 560
},
{
"epoch": 1.7647058823529411,
"grad_norm": 1.5625,
"learning_rate": 0.0001,
"loss": 1.3402,
"step": 570
},
{
"epoch": 1.7956656346749225,
"grad_norm": 0.734375,
"learning_rate": 0.0001,
"loss": 1.83,
"step": 580
},
{
"epoch": 1.826625386996904,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 1.774,
"step": 590
},
{
"epoch": 1.8575851393188856,
"grad_norm": 0.93359375,
"learning_rate": 0.0001,
"loss": 1.6684,
"step": 600
},
{
"epoch": 1.888544891640867,
"grad_norm": 1.1875,
"learning_rate": 0.0001,
"loss": 1.5126,
"step": 610
},
{
"epoch": 1.9195046439628483,
"grad_norm": 1.3671875,
"learning_rate": 0.0001,
"loss": 1.3526,
"step": 620
},
{
"epoch": 1.9504643962848296,
"grad_norm": 0.94140625,
"learning_rate": 0.0001,
"loss": 1.6972,
"step": 630
},
{
"epoch": 1.9814241486068112,
"grad_norm": 1.03125,
"learning_rate": 0.0001,
"loss": 1.5535,
"step": 640
},
{
"epoch": 2.0123839009287927,
"grad_norm": 0.50390625,
"learning_rate": 0.0001,
"loss": 1.5312,
"step": 650
},
{
"epoch": 2.043343653250774,
"grad_norm": 0.85546875,
"learning_rate": 0.0001,
"loss": 1.4688,
"step": 660
},
{
"epoch": 2.0743034055727554,
"grad_norm": 1.0703125,
"learning_rate": 0.0001,
"loss": 1.1771,
"step": 670
},
{
"epoch": 2.1052631578947367,
"grad_norm": 1.828125,
"learning_rate": 0.0001,
"loss": 0.9073,
"step": 680
},
{
"epoch": 2.136222910216718,
"grad_norm": 1.875,
"learning_rate": 0.0001,
"loss": 0.7219,
"step": 690
},
{
"epoch": 2.1671826625387,
"grad_norm": 0.9921875,
"learning_rate": 0.0001,
"loss": 1.0551,
"step": 700
},
{
"epoch": 2.198142414860681,
"grad_norm": 0.92578125,
"learning_rate": 0.0001,
"loss": 1.5082,
"step": 710
},
{
"epoch": 2.2291021671826625,
"grad_norm": 1.1015625,
"learning_rate": 0.0001,
"loss": 1.1655,
"step": 720
},
{
"epoch": 2.260061919504644,
"grad_norm": 1.578125,
"learning_rate": 0.0001,
"loss": 0.9166,
"step": 730
},
{
"epoch": 2.291021671826625,
"grad_norm": 1.96875,
"learning_rate": 0.0001,
"loss": 0.711,
"step": 740
},
{
"epoch": 2.321981424148607,
"grad_norm": 0.98828125,
"learning_rate": 0.0001,
"loss": 1.0771,
"step": 750
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.86328125,
"learning_rate": 0.0001,
"loss": 1.4724,
"step": 760
},
{
"epoch": 2.3839009287925697,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 1.1995,
"step": 770
},
{
"epoch": 2.414860681114551,
"grad_norm": 1.296875,
"learning_rate": 0.0001,
"loss": 0.886,
"step": 780
},
{
"epoch": 2.4458204334365323,
"grad_norm": 1.8984375,
"learning_rate": 0.0001,
"loss": 0.6432,
"step": 790
},
{
"epoch": 2.476780185758514,
"grad_norm": 1.125,
"learning_rate": 0.0001,
"loss": 1.0749,
"step": 800
},
{
"epoch": 2.5077399380804954,
"grad_norm": 0.8828125,
"learning_rate": 0.0001,
"loss": 1.4332,
"step": 810
},
{
"epoch": 2.538699690402477,
"grad_norm": 1.2109375,
"learning_rate": 0.0001,
"loss": 1.2049,
"step": 820
},
{
"epoch": 2.569659442724458,
"grad_norm": 1.65625,
"learning_rate": 0.0001,
"loss": 0.9171,
"step": 830
},
{
"epoch": 2.6006191950464395,
"grad_norm": 1.7421875,
"learning_rate": 0.0001,
"loss": 0.7159,
"step": 840
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.87109375,
"learning_rate": 0.0001,
"loss": 1.0601,
"step": 850
},
{
"epoch": 2.6625386996904026,
"grad_norm": 0.8515625,
"learning_rate": 0.0001,
"loss": 1.4824,
"step": 860
},
{
"epoch": 2.693498452012384,
"grad_norm": 1.1328125,
"learning_rate": 0.0001,
"loss": 1.2073,
"step": 870
},
{
"epoch": 2.7244582043343653,
"grad_norm": 1.6015625,
"learning_rate": 0.0001,
"loss": 0.9479,
"step": 880
},
{
"epoch": 2.7554179566563466,
"grad_norm": 1.609375,
"learning_rate": 0.0001,
"loss": 0.7719,
"step": 890
},
{
"epoch": 2.7863777089783284,
"grad_norm": 0.97265625,
"learning_rate": 0.0001,
"loss": 1.1318,
"step": 900
},
{
"epoch": 2.8173374613003097,
"grad_norm": 0.84375,
"learning_rate": 0.0001,
"loss": 1.4771,
"step": 910
},
{
"epoch": 2.848297213622291,
"grad_norm": 1.03125,
"learning_rate": 0.0001,
"loss": 1.1876,
"step": 920
},
{
"epoch": 2.8792569659442724,
"grad_norm": 1.4921875,
"learning_rate": 0.0001,
"loss": 0.9698,
"step": 930
},
{
"epoch": 2.9102167182662537,
"grad_norm": 1.6875,
"learning_rate": 0.0001,
"loss": 0.7357,
"step": 940
},
{
"epoch": 2.9411764705882355,
"grad_norm": 1.375,
"learning_rate": 0.0001,
"loss": 1.0427,
"step": 950
},
{
"epoch": 2.972136222910217,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 1.1243,
"step": 960
},
{
"epoch": 3.003095975232198,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.7874,
"step": 970
},
{
"epoch": 3.0340557275541795,
"grad_norm": 1.578125,
"learning_rate": 0.0001,
"loss": 1.1201,
"step": 980
},
{
"epoch": 3.065015479876161,
"grad_norm": 1.1953125,
"learning_rate": 0.0001,
"loss": 0.6787,
"step": 990
},
{
"epoch": 3.0959752321981426,
"grad_norm": 1.4921875,
"learning_rate": 0.0001,
"loss": 0.4248,
"step": 1000
},
{
"epoch": 3.126934984520124,
"grad_norm": 1.640625,
"learning_rate": 0.0001,
"loss": 0.3441,
"step": 1010
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.8203125,
"learning_rate": 0.0001,
"loss": 0.3431,
"step": 1020
},
{
"epoch": 3.1888544891640866,
"grad_norm": 1.2265625,
"learning_rate": 0.0001,
"loss": 1.0994,
"step": 1030
},
{
"epoch": 3.219814241486068,
"grad_norm": 1.4765625,
"learning_rate": 0.0001,
"loss": 0.6538,
"step": 1040
},
{
"epoch": 3.2507739938080498,
"grad_norm": 1.9140625,
"learning_rate": 0.0001,
"loss": 0.4197,
"step": 1050
},
{
"epoch": 3.281733746130031,
"grad_norm": 1.984375,
"learning_rate": 0.0001,
"loss": 0.322,
"step": 1060
},
{
"epoch": 3.3126934984520124,
"grad_norm": 0.83203125,
"learning_rate": 0.0001,
"loss": 0.382,
"step": 1070
},
{
"epoch": 3.343653250773994,
"grad_norm": 1.1171875,
"learning_rate": 0.0001,
"loss": 1.1316,
"step": 1080
},
{
"epoch": 3.374613003095975,
"grad_norm": 1.7734375,
"learning_rate": 0.0001,
"loss": 0.7036,
"step": 1090
},
{
"epoch": 3.405572755417957,
"grad_norm": 1.4296875,
"learning_rate": 0.0001,
"loss": 0.4367,
"step": 1100
},
{
"epoch": 3.4365325077399382,
"grad_norm": 2.078125,
"learning_rate": 0.0001,
"loss": 0.2773,
"step": 1110
},
{
"epoch": 3.4674922600619196,
"grad_norm": 0.9375,
"learning_rate": 0.0001,
"loss": 0.3452,
"step": 1120
},
{
"epoch": 3.498452012383901,
"grad_norm": 1.21875,
"learning_rate": 0.0001,
"loss": 1.0893,
"step": 1130
},
{
"epoch": 3.5294117647058822,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 0.6688,
"step": 1140
},
{
"epoch": 3.560371517027864,
"grad_norm": 1.7109375,
"learning_rate": 0.0001,
"loss": 0.4231,
"step": 1150
},
{
"epoch": 3.5913312693498454,
"grad_norm": 1.890625,
"learning_rate": 0.0001,
"loss": 0.3495,
"step": 1160
},
{
"epoch": 3.6222910216718267,
"grad_norm": 0.8828125,
"learning_rate": 0.0001,
"loss": 0.3711,
"step": 1170
},
{
"epoch": 3.653250773993808,
"grad_norm": 1.3046875,
"learning_rate": 0.0001,
"loss": 1.0888,
"step": 1180
},
{
"epoch": 3.6842105263157894,
"grad_norm": 1.484375,
"learning_rate": 0.0001,
"loss": 0.6578,
"step": 1190
},
{
"epoch": 3.715170278637771,
"grad_norm": 1.390625,
"learning_rate": 0.0001,
"loss": 0.4095,
"step": 1200
},
{
"epoch": 3.746130030959752,
"grad_norm": 1.671875,
"learning_rate": 0.0001,
"loss": 0.3025,
"step": 1210
},
{
"epoch": 3.777089783281734,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 0.3888,
"step": 1220
},
{
"epoch": 3.808049535603715,
"grad_norm": 1.3359375,
"learning_rate": 0.0001,
"loss": 1.1246,
"step": 1230
},
{
"epoch": 3.8390092879256965,
"grad_norm": 1.375,
"learning_rate": 0.0001,
"loss": 0.7352,
"step": 1240
},
{
"epoch": 3.8699690402476783,
"grad_norm": 1.5703125,
"learning_rate": 0.0001,
"loss": 0.4738,
"step": 1250
},
{
"epoch": 3.900928792569659,
"grad_norm": 1.6015625,
"learning_rate": 0.0001,
"loss": 0.3303,
"step": 1260
},
{
"epoch": 3.931888544891641,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.3753,
"step": 1270
},
{
"epoch": 3.9628482972136223,
"grad_norm": 1.46875,
"learning_rate": 0.0001,
"loss": 0.8517,
"step": 1280
},
{
"epoch": 3.9938080495356036,
"grad_norm": 1.4140625,
"learning_rate": 0.0001,
"loss": 0.3625,
"step": 1290
},
{
"epoch": 4.024767801857585,
"grad_norm": 1.5,
"learning_rate": 0.0001,
"loss": 0.6243,
"step": 1300
},
{
"epoch": 4.055727554179566,
"grad_norm": 1.1328125,
"learning_rate": 0.0001,
"loss": 0.3312,
"step": 1310
},
{
"epoch": 4.086687306501548,
"grad_norm": 1.046875,
"learning_rate": 0.0001,
"loss": 0.1841,
"step": 1320
},
{
"epoch": 4.117647058823529,
"grad_norm": 1.4453125,
"learning_rate": 0.0001,
"loss": 0.1307,
"step": 1330
},
{
"epoch": 4.148606811145511,
"grad_norm": 1.84375,
"learning_rate": 0.0001,
"loss": 0.1003,
"step": 1340
},
{
"epoch": 4.179566563467493,
"grad_norm": 1.421875,
"learning_rate": 0.0001,
"loss": 0.6324,
"step": 1350
},
{
"epoch": 4.2105263157894735,
"grad_norm": 1.625,
"learning_rate": 0.0001,
"loss": 0.3237,
"step": 1360
},
{
"epoch": 4.241486068111455,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 0.1651,
"step": 1370
},
{
"epoch": 4.272445820433436,
"grad_norm": 0.99609375,
"learning_rate": 0.0001,
"loss": 0.1206,
"step": 1380
},
{
"epoch": 4.303405572755418,
"grad_norm": 2.171875,
"learning_rate": 0.0001,
"loss": 0.1298,
"step": 1390
},
{
"epoch": 4.3343653250774,
"grad_norm": 1.2734375,
"learning_rate": 0.0001,
"loss": 0.5857,
"step": 1400
},
{
"epoch": 4.365325077399381,
"grad_norm": 1.2109375,
"learning_rate": 0.0001,
"loss": 0.3743,
"step": 1410
},
{
"epoch": 4.396284829721362,
"grad_norm": 1.2421875,
"learning_rate": 0.0001,
"loss": 0.2053,
"step": 1420
},
{
"epoch": 4.427244582043343,
"grad_norm": 1.703125,
"learning_rate": 0.0001,
"loss": 0.1376,
"step": 1430
},
{
"epoch": 4.458204334365325,
"grad_norm": 1.2578125,
"learning_rate": 0.0001,
"loss": 0.1043,
"step": 1440
},
{
"epoch": 4.489164086687307,
"grad_norm": 1.234375,
"learning_rate": 0.0001,
"loss": 0.6281,
"step": 1450
},
{
"epoch": 4.520123839009288,
"grad_norm": 1.640625,
"learning_rate": 0.0001,
"loss": 0.3621,
"step": 1460
},
{
"epoch": 4.5510835913312695,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 0.2105,
"step": 1470
},
{
"epoch": 4.58204334365325,
"grad_norm": 1.5625,
"learning_rate": 0.0001,
"loss": 0.1891,
"step": 1480
},
{
"epoch": 4.613003095975232,
"grad_norm": 0.98046875,
"learning_rate": 0.0001,
"loss": 0.1215,
"step": 1490
},
{
"epoch": 4.643962848297214,
"grad_norm": 1.4375,
"learning_rate": 0.0001,
"loss": 0.6574,
"step": 1500
},
{
"epoch": 4.674922600619195,
"grad_norm": 1.8515625,
"learning_rate": 0.0001,
"loss": 0.3907,
"step": 1510
},
{
"epoch": 4.705882352941177,
"grad_norm": 1.4375,
"learning_rate": 0.0001,
"loss": 0.2264,
"step": 1520
},
{
"epoch": 4.7368421052631575,
"grad_norm": 1.4375,
"learning_rate": 0.0001,
"loss": 0.1498,
"step": 1530
},
{
"epoch": 4.767801857585139,
"grad_norm": 1.375,
"learning_rate": 0.0001,
"loss": 0.1238,
"step": 1540
},
{
"epoch": 4.798761609907121,
"grad_norm": 1.359375,
"learning_rate": 0.0001,
"loss": 0.65,
"step": 1550
},
{
"epoch": 4.829721362229102,
"grad_norm": 1.609375,
"learning_rate": 0.0001,
"loss": 0.3445,
"step": 1560
},
{
"epoch": 4.860681114551084,
"grad_norm": 1.1328125,
"learning_rate": 0.0001,
"loss": 0.2066,
"step": 1570
},
{
"epoch": 4.891640866873065,
"grad_norm": 1.140625,
"learning_rate": 0.0001,
"loss": 0.143,
"step": 1580
},
{
"epoch": 4.922600619195046,
"grad_norm": 1.4921875,
"learning_rate": 0.0001,
"loss": 0.1308,
"step": 1590
},
{
"epoch": 4.953560371517028,
"grad_norm": 1.453125,
"learning_rate": 0.0001,
"loss": 0.4828,
"step": 1600
},
{
"epoch": 4.984520123839009,
"grad_norm": 1.5390625,
"learning_rate": 0.0001,
"loss": 0.1781,
"step": 1610
},
{
"epoch": 5.015479876160991,
"grad_norm": 1.0546875,
"learning_rate": 0.0001,
"loss": 0.3468,
"step": 1620
},
{
"epoch": 5.046439628482972,
"grad_norm": 1.7734375,
"learning_rate": 0.0001,
"loss": 0.1674,
"step": 1630
},
{
"epoch": 5.077399380804954,
"grad_norm": 1.6015625,
"learning_rate": 0.0001,
"loss": 0.0983,
"step": 1640
},
{
"epoch": 5.108359133126935,
"grad_norm": 0.91796875,
"learning_rate": 0.0001,
"loss": 0.0648,
"step": 1650
},
{
"epoch": 5.139318885448916,
"grad_norm": 1.765625,
"learning_rate": 0.0001,
"loss": 0.0799,
"step": 1660
},
{
"epoch": 5.170278637770898,
"grad_norm": 1.125,
"learning_rate": 0.0001,
"loss": 0.2795,
"step": 1670
},
{
"epoch": 5.201238390092879,
"grad_norm": 1.453125,
"learning_rate": 0.0001,
"loss": 0.1749,
"step": 1680
},
{
"epoch": 5.232198142414861,
"grad_norm": 1.40625,
"learning_rate": 0.0001,
"loss": 0.1115,
"step": 1690
},
{
"epoch": 5.2631578947368425,
"grad_norm": 1.0859375,
"learning_rate": 0.0001,
"loss": 0.0732,
"step": 1700
},
{
"epoch": 5.294117647058823,
"grad_norm": 0.64453125,
"learning_rate": 0.0001,
"loss": 0.0614,
"step": 1710
},
{
"epoch": 5.325077399380805,
"grad_norm": 1.21875,
"learning_rate": 0.0001,
"loss": 0.29,
"step": 1720
},
{
"epoch": 5.356037151702786,
"grad_norm": 1.59375,
"learning_rate": 0.0001,
"loss": 0.1984,
"step": 1730
},
{
"epoch": 5.386996904024768,
"grad_norm": 0.79296875,
"learning_rate": 0.0001,
"loss": 0.0959,
"step": 1740
},
{
"epoch": 5.41795665634675,
"grad_norm": 1.0546875,
"learning_rate": 0.0001,
"loss": 0.075,
"step": 1750
},
{
"epoch": 5.4489164086687305,
"grad_norm": 1.65625,
"learning_rate": 0.0001,
"loss": 0.0607,
"step": 1760
},
{
"epoch": 5.479876160990712,
"grad_norm": 1.328125,
"learning_rate": 0.0001,
"loss": 0.2815,
"step": 1770
},
{
"epoch": 5.510835913312693,
"grad_norm": 1.4609375,
"learning_rate": 0.0001,
"loss": 0.1861,
"step": 1780
},
{
"epoch": 5.541795665634675,
"grad_norm": 1.1484375,
"learning_rate": 0.0001,
"loss": 0.1012,
"step": 1790
},
{
"epoch": 5.572755417956657,
"grad_norm": 1.3359375,
"learning_rate": 0.0001,
"loss": 0.067,
"step": 1800
},
{
"epoch": 5.603715170278638,
"grad_norm": 1.6484375,
"learning_rate": 0.0001,
"loss": 0.0749,
"step": 1810
},
{
"epoch": 5.634674922600619,
"grad_norm": 1.1015625,
"learning_rate": 0.0001,
"loss": 0.2726,
"step": 1820
},
{
"epoch": 5.6656346749226,
"grad_norm": 2.15625,
"learning_rate": 0.0001,
"loss": 0.1799,
"step": 1830
},
{
"epoch": 5.696594427244582,
"grad_norm": 1.0078125,
"learning_rate": 0.0001,
"loss": 0.1021,
"step": 1840
},
{
"epoch": 5.727554179566564,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 0.0691,
"step": 1850
},
{
"epoch": 5.758513931888545,
"grad_norm": 1.0703125,
"learning_rate": 0.0001,
"loss": 0.0645,
"step": 1860
},
{
"epoch": 5.7894736842105265,
"grad_norm": 1.3125,
"learning_rate": 0.0001,
"loss": 0.3198,
"step": 1870
},
{
"epoch": 5.820433436532507,
"grad_norm": 1.390625,
"learning_rate": 0.0001,
"loss": 0.218,
"step": 1880
},
{
"epoch": 5.851393188854489,
"grad_norm": 1.359375,
"learning_rate": 0.0001,
"loss": 0.1078,
"step": 1890
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.984375,
"learning_rate": 0.0001,
"loss": 0.0751,
"step": 1900
},
{
"epoch": 5.913312693498452,
"grad_norm": 1.2265625,
"learning_rate": 0.0001,
"loss": 0.0749,
"step": 1910
},
{
"epoch": 5.944272445820434,
"grad_norm": 1.546875,
"learning_rate": 0.0001,
"loss": 0.2301,
"step": 1920
},
{
"epoch": 5.975232198142415,
"grad_norm": 1.8828125,
"learning_rate": 0.0001,
"loss": 0.1302,
"step": 1930
},
{
"epoch": 6.006191950464396,
"grad_norm": 0.98046875,
"learning_rate": 0.0001,
"loss": 0.1467,
"step": 1940
},
{
"epoch": 6.037151702786378,
"grad_norm": 0.953125,
"learning_rate": 0.0001,
"loss": 0.1079,
"step": 1950
},
{
"epoch": 6.068111455108359,
"grad_norm": 1.9765625,
"learning_rate": 0.0001,
"loss": 0.0593,
"step": 1960
},
{
"epoch": 6.099071207430341,
"grad_norm": 1.0546875,
"learning_rate": 0.0001,
"loss": 0.0459,
"step": 1970
},
{
"epoch": 6.130030959752322,
"grad_norm": 1.0078125,
"learning_rate": 0.0001,
"loss": 0.0337,
"step": 1980
},
{
"epoch": 6.1609907120743035,
"grad_norm": 1.2734375,
"learning_rate": 0.0001,
"loss": 0.1309,
"step": 1990
},
{
"epoch": 6.191950464396285,
"grad_norm": 0.80078125,
"learning_rate": 0.0001,
"loss": 0.1197,
"step": 2000
},
{
"epoch": 6.222910216718266,
"grad_norm": 1.484375,
"learning_rate": 0.0001,
"loss": 0.0668,
"step": 2010
},
{
"epoch": 6.253869969040248,
"grad_norm": 0.95703125,
"learning_rate": 0.0001,
"loss": 0.0569,
"step": 2020
},
{
"epoch": 6.284829721362229,
"grad_norm": 1.4765625,
"learning_rate": 0.0001,
"loss": 0.0432,
"step": 2030
},
{
"epoch": 6.315789473684211,
"grad_norm": 0.9453125,
"learning_rate": 0.0001,
"loss": 0.1081,
"step": 2040
},
{
"epoch": 6.346749226006192,
"grad_norm": 1.1015625,
"learning_rate": 0.0001,
"loss": 0.1104,
"step": 2050
},
{
"epoch": 6.377708978328173,
"grad_norm": 1.7421875,
"learning_rate": 0.0001,
"loss": 0.0799,
"step": 2060
},
{
"epoch": 6.408668730650155,
"grad_norm": 1.1640625,
"learning_rate": 0.0001,
"loss": 0.0499,
"step": 2070
},
{
"epoch": 6.439628482972136,
"grad_norm": 0.82421875,
"learning_rate": 0.0001,
"loss": 0.0472,
"step": 2080
},
{
"epoch": 6.470588235294118,
"grad_norm": 1.2109375,
"learning_rate": 0.0001,
"loss": 0.1062,
"step": 2090
},
{
"epoch": 6.5015479876160995,
"grad_norm": 1.265625,
"learning_rate": 0.0001,
"loss": 0.1386,
"step": 2100
},
{
"epoch": 6.53250773993808,
"grad_norm": 1.734375,
"learning_rate": 0.0001,
"loss": 0.0756,
"step": 2110
},
{
"epoch": 6.563467492260062,
"grad_norm": 1.1953125,
"learning_rate": 0.0001,
"loss": 0.0602,
"step": 2120
},
{
"epoch": 6.594427244582043,
"grad_norm": 1.640625,
"learning_rate": 0.0001,
"loss": 0.0422,
"step": 2130
},
{
"epoch": 6.625386996904025,
"grad_norm": 1.28125,
"learning_rate": 0.0001,
"loss": 0.1138,
"step": 2140
},
{
"epoch": 6.656346749226007,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 0.1462,
"step": 2150
},
{
"epoch": 6.687306501547988,
"grad_norm": 1.3125,
"learning_rate": 0.0001,
"loss": 0.0758,
"step": 2160
},
{
"epoch": 6.718266253869969,
"grad_norm": 1.5625,
"learning_rate": 0.0001,
"loss": 0.0621,
"step": 2170
},
{
"epoch": 6.74922600619195,
"grad_norm": 2.4375,
"learning_rate": 0.0001,
"loss": 0.0544,
"step": 2180
},
{
"epoch": 6.780185758513932,
"grad_norm": 1.265625,
"learning_rate": 0.0001,
"loss": 0.1573,
"step": 2190
},
{
"epoch": 6.811145510835914,
"grad_norm": 1.0078125,
"learning_rate": 0.0001,
"loss": 0.1575,
"step": 2200
},
{
"epoch": 6.842105263157895,
"grad_norm": 1.5625,
"learning_rate": 0.0001,
"loss": 0.0724,
"step": 2210
},
{
"epoch": 6.8730650154798765,
"grad_norm": 1.109375,
"learning_rate": 0.0001,
"loss": 0.0658,
"step": 2220
},
{
"epoch": 6.904024767801857,
"grad_norm": 0.9921875,
"learning_rate": 0.0001,
"loss": 0.0435,
"step": 2230
},
{
"epoch": 6.934984520123839,
"grad_norm": 1.296875,
"learning_rate": 0.0001,
"loss": 0.0894,
"step": 2240
},
{
"epoch": 6.965944272445821,
"grad_norm": 1.34375,
"learning_rate": 0.0001,
"loss": 0.1117,
"step": 2250
},
{
"epoch": 6.996904024767802,
"grad_norm": 1.28125,
"learning_rate": 0.0001,
"loss": 0.0593,
"step": 2260
},
{
"epoch": 7.027863777089784,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 0.0889,
"step": 2270
},
{
"epoch": 7.0588235294117645,
"grad_norm": 1.0078125,
"learning_rate": 0.0001,
"loss": 0.0476,
"step": 2280
},
{
"epoch": 7.089783281733746,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 0.0382,
"step": 2290
},
{
"epoch": 7.120743034055727,
"grad_norm": 1.8046875,
"learning_rate": 0.0001,
"loss": 0.0366,
"step": 2300
},
{
"epoch": 7.151702786377709,
"grad_norm": 0.78515625,
"learning_rate": 0.0001,
"loss": 0.0277,
"step": 2310
},
{
"epoch": 7.182662538699691,
"grad_norm": 1.0078125,
"learning_rate": 0.0001,
"loss": 0.1059,
"step": 2320
},
{
"epoch": 7.213622291021672,
"grad_norm": 1.0546875,
"learning_rate": 0.0001,
"loss": 0.0564,
"step": 2330
},
{
"epoch": 7.244582043343653,
"grad_norm": 1.1171875,
"learning_rate": 0.0001,
"loss": 0.0435,
"step": 2340
},
{
"epoch": 7.275541795665634,
"grad_norm": 1.2109375,
"learning_rate": 0.0001,
"loss": 0.0381,
"step": 2350
},
{
"epoch": 7.306501547987616,
"grad_norm": 1.421875,
"learning_rate": 0.0001,
"loss": 0.0567,
"step": 2360
},
{
"epoch": 7.337461300309598,
"grad_norm": 0.80078125,
"learning_rate": 0.0001,
"loss": 0.1014,
"step": 2370
},
{
"epoch": 7.368421052631579,
"grad_norm": 1.0234375,
"learning_rate": 0.0001,
"loss": 0.0596,
"step": 2380
},
{
"epoch": 7.3993808049535605,
"grad_norm": 1.625,
"learning_rate": 0.0001,
"loss": 0.047,
"step": 2390
},
{
"epoch": 7.430340557275541,
"grad_norm": 1.1015625,
"learning_rate": 0.0001,
"loss": 0.0298,
"step": 2400
},
{
"epoch": 7.461300309597523,
"grad_norm": 2.234375,
"learning_rate": 0.0001,
"loss": 0.0365,
"step": 2410
},
{
"epoch": 7.492260061919505,
"grad_norm": 0.76171875,
"learning_rate": 0.0001,
"loss": 0.1112,
"step": 2420
},
{
"epoch": 7.523219814241486,
"grad_norm": 0.84375,
"learning_rate": 0.0001,
"loss": 0.0604,
"step": 2430
},
{
"epoch": 7.554179566563468,
"grad_norm": 1.9296875,
"learning_rate": 0.0001,
"loss": 0.0465,
"step": 2440
},
{
"epoch": 7.585139318885449,
"grad_norm": 1.546875,
"learning_rate": 0.0001,
"loss": 0.046,
"step": 2450
},
{
"epoch": 7.61609907120743,
"grad_norm": 0.4921875,
"learning_rate": 0.0001,
"loss": 0.0346,
"step": 2460
},
{
"epoch": 7.647058823529412,
"grad_norm": 0.9453125,
"learning_rate": 0.0001,
"loss": 0.1163,
"step": 2470
},
{
"epoch": 7.678018575851393,
"grad_norm": 1.3984375,
"learning_rate": 0.0001,
"loss": 0.058,
"step": 2480
},
{
"epoch": 7.708978328173375,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 0.0476,
"step": 2490
},
{
"epoch": 7.739938080495356,
"grad_norm": 0.76171875,
"learning_rate": 0.0001,
"loss": 0.0487,
"step": 2500
},
{
"epoch": 7.7708978328173375,
"grad_norm": 1.6640625,
"learning_rate": 0.0001,
"loss": 0.0327,
"step": 2510
},
{
"epoch": 7.801857585139319,
"grad_norm": 0.93359375,
"learning_rate": 0.0001,
"loss": 0.1199,
"step": 2520
},
{
"epoch": 7.8328173374613,
"grad_norm": 1.234375,
"learning_rate": 0.0001,
"loss": 0.0639,
"step": 2530
},
{
"epoch": 7.863777089783282,
"grad_norm": 1.1015625,
"learning_rate": 0.0001,
"loss": 0.0535,
"step": 2540
},
{
"epoch": 7.894736842105263,
"grad_norm": 1.2109375,
"learning_rate": 0.0001,
"loss": 0.0386,
"step": 2550
},
{
"epoch": 7.925696594427245,
"grad_norm": 2.5,
"learning_rate": 0.0001,
"loss": 0.0523,
"step": 2560
},
{
"epoch": 7.956656346749226,
"grad_norm": 1.3046875,
"learning_rate": 0.0001,
"loss": 0.1184,
"step": 2570
},
{
"epoch": 7.987616099071207,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 0.0558,
"step": 2580
},
{
"epoch": 8.018575851393189,
"grad_norm": 0.94921875,
"learning_rate": 0.0001,
"loss": 0.0601,
"step": 2590
},
{
"epoch": 8.04953560371517,
"grad_norm": 0.78125,
"learning_rate": 0.0001,
"loss": 0.0375,
"step": 2600
},
{
"epoch": 8.08049535603715,
"grad_norm": 0.7265625,
"learning_rate": 0.0001,
"loss": 0.0317,
"step": 2610
},
{
"epoch": 8.111455108359133,
"grad_norm": 0.9921875,
"learning_rate": 0.0001,
"loss": 0.0322,
"step": 2620
},
{
"epoch": 8.142414860681114,
"grad_norm": 0.376953125,
"learning_rate": 0.0001,
"loss": 0.024,
"step": 2630
},
{
"epoch": 8.173374613003096,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 0.061,
"step": 2640
},
{
"epoch": 8.204334365325078,
"grad_norm": 1.3984375,
"learning_rate": 0.0001,
"loss": 0.0417,
"step": 2650
},
{
"epoch": 8.235294117647058,
"grad_norm": 1.421875,
"learning_rate": 0.0001,
"loss": 0.0312,
"step": 2660
},
{
"epoch": 8.26625386996904,
"grad_norm": 1.375,
"learning_rate": 0.0001,
"loss": 0.0365,
"step": 2670
},
{
"epoch": 8.297213622291022,
"grad_norm": 0.466796875,
"learning_rate": 0.0001,
"loss": 0.025,
"step": 2680
},
{
"epoch": 8.328173374613003,
"grad_norm": 0.91015625,
"learning_rate": 0.0001,
"loss": 0.0621,
"step": 2690
},
{
"epoch": 8.359133126934985,
"grad_norm": 1.0,
"learning_rate": 0.0001,
"loss": 0.0552,
"step": 2700
},
{
"epoch": 8.390092879256965,
"grad_norm": 1.4765625,
"learning_rate": 0.0001,
"loss": 0.0391,
"step": 2710
},
{
"epoch": 8.421052631578947,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 0.0272,
"step": 2720
},
{
"epoch": 8.452012383900929,
"grad_norm": 1.28125,
"learning_rate": 0.0001,
"loss": 0.0184,
"step": 2730
},
{
"epoch": 8.48297213622291,
"grad_norm": 0.953125,
"learning_rate": 0.0001,
"loss": 0.071,
"step": 2740
},
{
"epoch": 8.513931888544892,
"grad_norm": 0.9140625,
"learning_rate": 0.0001,
"loss": 0.0484,
"step": 2750
},
{
"epoch": 8.544891640866872,
"grad_norm": 1.140625,
"learning_rate": 0.0001,
"loss": 0.0353,
"step": 2760
},
{
"epoch": 8.575851393188854,
"grad_norm": 1.390625,
"learning_rate": 0.0001,
"loss": 0.0363,
"step": 2770
},
{
"epoch": 8.606811145510836,
"grad_norm": 2.53125,
"learning_rate": 0.0001,
"loss": 0.0356,
"step": 2780
},
{
"epoch": 8.637770897832818,
"grad_norm": 0.98046875,
"learning_rate": 0.0001,
"loss": 0.0819,
"step": 2790
},
{
"epoch": 8.6687306501548,
"grad_norm": 1.359375,
"learning_rate": 0.0001,
"loss": 0.057,
"step": 2800
},
{
"epoch": 8.69969040247678,
"grad_norm": 0.8671875,
"learning_rate": 0.0001,
"loss": 0.0374,
"step": 2810
},
{
"epoch": 8.730650154798761,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 0.0262,
"step": 2820
},
{
"epoch": 8.761609907120743,
"grad_norm": 0.9453125,
"learning_rate": 0.0001,
"loss": 0.0347,
"step": 2830
},
{
"epoch": 8.792569659442725,
"grad_norm": 0.90625,
"learning_rate": 0.0001,
"loss": 0.0593,
"step": 2840
},
{
"epoch": 8.823529411764707,
"grad_norm": 1.0390625,
"learning_rate": 0.0001,
"loss": 0.0566,
"step": 2850
},
{
"epoch": 8.854489164086687,
"grad_norm": 0.765625,
"learning_rate": 0.0001,
"loss": 0.0392,
"step": 2860
},
{
"epoch": 8.885448916408668,
"grad_norm": 1.3984375,
"learning_rate": 0.0001,
"loss": 0.0423,
"step": 2870
},
{
"epoch": 8.91640866873065,
"grad_norm": 1.6796875,
"learning_rate": 0.0001,
"loss": 0.0414,
"step": 2880
},
{
"epoch": 8.947368421052632,
"grad_norm": 1.3515625,
"learning_rate": 0.0001,
"loss": 0.0644,
"step": 2890
},
{
"epoch": 8.978328173374614,
"grad_norm": 1.2421875,
"learning_rate": 0.0001,
"loss": 0.0445,
"step": 2900
},
{
"epoch": 9.009287925696594,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 0.039,
"step": 2910
},
{
"epoch": 9.040247678018575,
"grad_norm": 0.443359375,
"learning_rate": 0.0001,
"loss": 0.0326,
"step": 2920
},
{
"epoch": 9.071207430340557,
"grad_norm": 1.046875,
"learning_rate": 0.0001,
"loss": 0.0283,
"step": 2930
},
{
"epoch": 9.102167182662539,
"grad_norm": 1.1640625,
"learning_rate": 0.0001,
"loss": 0.0239,
"step": 2940
},
{
"epoch": 9.13312693498452,
"grad_norm": 0.87109375,
"learning_rate": 0.0001,
"loss": 0.023,
"step": 2950
},
{
"epoch": 9.1640866873065,
"grad_norm": 0.87890625,
"learning_rate": 0.0001,
"loss": 0.0315,
"step": 2960
},
{
"epoch": 9.195046439628483,
"grad_norm": 0.80859375,
"learning_rate": 0.0001,
"loss": 0.0473,
"step": 2970
},
{
"epoch": 9.226006191950464,
"grad_norm": 1.0703125,
"learning_rate": 0.0001,
"loss": 0.0423,
"step": 2980
},
{
"epoch": 9.256965944272446,
"grad_norm": 1.7421875,
"learning_rate": 0.0001,
"loss": 0.0269,
"step": 2990
},
{
"epoch": 9.287925696594428,
"grad_norm": 0.7421875,
"learning_rate": 0.0001,
"loss": 0.0239,
"step": 3000
},
{
"epoch": 9.318885448916408,
"grad_norm": 0.85546875,
"learning_rate": 0.0001,
"loss": 0.057,
"step": 3010
},
{
"epoch": 9.34984520123839,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 0.048,
"step": 3020
},
{
"epoch": 9.380804953560371,
"grad_norm": 0.8359375,
"learning_rate": 0.0001,
"loss": 0.0347,
"step": 3030
},
{
"epoch": 9.411764705882353,
"grad_norm": 1.5625,
"learning_rate": 0.0001,
"loss": 0.0284,
"step": 3040
},
{
"epoch": 9.442724458204335,
"grad_norm": 0.625,
"learning_rate": 0.0001,
"loss": 0.0276,
"step": 3050
},
{
"epoch": 9.473684210526315,
"grad_norm": 0.8515625,
"learning_rate": 0.0001,
"loss": 0.0434,
"step": 3060
},
{
"epoch": 9.504643962848297,
"grad_norm": 0.96484375,
"learning_rate": 0.0001,
"loss": 0.0446,
"step": 3070
},
{
"epoch": 9.535603715170279,
"grad_norm": 0.734375,
"learning_rate": 0.0001,
"loss": 0.0377,
"step": 3080
},
{
"epoch": 9.56656346749226,
"grad_norm": 1.1953125,
"learning_rate": 0.0001,
"loss": 0.046,
"step": 3090
},
{
"epoch": 9.597523219814242,
"grad_norm": 0.265625,
"learning_rate": 0.0001,
"loss": 0.0203,
"step": 3100
},
{
"epoch": 9.628482972136222,
"grad_norm": 1.046875,
"learning_rate": 0.0001,
"loss": 0.0403,
"step": 3110
},
{
"epoch": 9.659442724458204,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 0.0488,
"step": 3120
},
{
"epoch": 9.690402476780186,
"grad_norm": 0.77734375,
"learning_rate": 0.0001,
"loss": 0.0418,
"step": 3130
},
{
"epoch": 9.721362229102168,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 0.0314,
"step": 3140
},
{
"epoch": 9.75232198142415,
"grad_norm": 1.0234375,
"learning_rate": 0.0001,
"loss": 0.0369,
"step": 3150
},
{
"epoch": 9.78328173374613,
"grad_norm": 0.78515625,
"learning_rate": 0.0001,
"loss": 0.047,
"step": 3160
},
{
"epoch": 9.814241486068111,
"grad_norm": 0.9765625,
"learning_rate": 0.0001,
"loss": 0.053,
"step": 3170
},
{
"epoch": 9.845201238390093,
"grad_norm": 1.3125,
"learning_rate": 0.0001,
"loss": 0.0413,
"step": 3180
},
{
"epoch": 9.876160990712075,
"grad_norm": 0.98828125,
"learning_rate": 0.0001,
"loss": 0.0332,
"step": 3190
},
{
"epoch": 9.907120743034056,
"grad_norm": 1.328125,
"learning_rate": 0.0001,
"loss": 0.036,
"step": 3200
},
{
"epoch": 9.938080495356036,
"grad_norm": 1.125,
"learning_rate": 0.0001,
"loss": 0.0494,
"step": 3210
},
{
"epoch": 9.969040247678018,
"grad_norm": 1.3046875,
"learning_rate": 0.0001,
"loss": 0.0505,
"step": 3220
},
{
"epoch": 10.0,
"grad_norm": 7.59375,
"learning_rate": 0.0001,
"loss": 0.0234,
"step": 3230
},
{
"epoch": 10.030959752321982,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 0.0444,
"step": 3240
},
{
"epoch": 10.061919504643964,
"grad_norm": 0.8828125,
"learning_rate": 0.0001,
"loss": 0.0252,
"step": 3250
},
{
"epoch": 10.092879256965944,
"grad_norm": 0.86328125,
"learning_rate": 0.0001,
"loss": 0.0302,
"step": 3260
},
{
"epoch": 10.123839009287925,
"grad_norm": 0.47265625,
"learning_rate": 0.0001,
"loss": 0.0198,
"step": 3270
},
{
"epoch": 10.154798761609907,
"grad_norm": 1.65625,
"learning_rate": 0.0001,
"loss": 0.0334,
"step": 3280
},
{
"epoch": 10.185758513931889,
"grad_norm": 0.875,
"learning_rate": 0.0001,
"loss": 0.0425,
"step": 3290
},
{
"epoch": 10.21671826625387,
"grad_norm": 0.734375,
"learning_rate": 0.0001,
"loss": 0.035,
"step": 3300
},
{
"epoch": 10.24767801857585,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 0.0222,
"step": 3310
},
{
"epoch": 10.278637770897832,
"grad_norm": 0.578125,
"learning_rate": 0.0001,
"loss": 0.0189,
"step": 3320
},
{
"epoch": 10.309597523219814,
"grad_norm": 0.059326171875,
"learning_rate": 0.0001,
"loss": 0.0338,
"step": 3330
},
{
"epoch": 10.340557275541796,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 0.0481,
"step": 3340
},
{
"epoch": 10.371517027863778,
"grad_norm": 1.234375,
"learning_rate": 0.0001,
"loss": 0.0405,
"step": 3350
},
{
"epoch": 10.402476780185758,
"grad_norm": 0.9609375,
"learning_rate": 0.0001,
"loss": 0.0288,
"step": 3360
},
{
"epoch": 10.43343653250774,
"grad_norm": 0.98046875,
"learning_rate": 0.0001,
"loss": 0.0322,
"step": 3370
},
{
"epoch": 10.464396284829721,
"grad_norm": 1.1875,
"learning_rate": 0.0001,
"loss": 0.0249,
"step": 3380
},
{
"epoch": 10.495356037151703,
"grad_norm": 1.171875,
"learning_rate": 0.0001,
"loss": 0.0556,
"step": 3390
},
{
"epoch": 10.526315789473685,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 0.038,
"step": 3400
},
{
"epoch": 10.557275541795665,
"grad_norm": 0.87890625,
"learning_rate": 0.0001,
"loss": 0.0356,
"step": 3410
},
{
"epoch": 10.588235294117647,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 0.0347,
"step": 3420
},
{
"epoch": 10.619195046439629,
"grad_norm": 2.46875,
"learning_rate": 0.0001,
"loss": 0.0297,
"step": 3430
},
{
"epoch": 10.65015479876161,
"grad_norm": 0.94140625,
"learning_rate": 0.0001,
"loss": 0.0537,
"step": 3440
},
{
"epoch": 10.681114551083592,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.04,
"step": 3450
},
{
"epoch": 10.712074303405572,
"grad_norm": 0.70703125,
"learning_rate": 0.0001,
"loss": 0.031,
"step": 3460
},
{
"epoch": 10.743034055727554,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 0.0355,
"step": 3470
},
{
"epoch": 10.773993808049536,
"grad_norm": 1.1953125,
"learning_rate": 0.0001,
"loss": 0.0308,
"step": 3480
},
{
"epoch": 10.804953560371517,
"grad_norm": 0.96484375,
"learning_rate": 0.0001,
"loss": 0.051,
"step": 3490
},
{
"epoch": 10.8359133126935,
"grad_norm": 1.3828125,
"learning_rate": 0.0001,
"loss": 0.0372,
"step": 3500
},
{
"epoch": 10.86687306501548,
"grad_norm": 0.6875,
"learning_rate": 0.0001,
"loss": 0.0272,
"step": 3510
},
{
"epoch": 10.897832817337461,
"grad_norm": 1.03125,
"learning_rate": 0.0001,
"loss": 0.0314,
"step": 3520
},
{
"epoch": 10.928792569659443,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 0.0278,
"step": 3530
},
{
"epoch": 10.959752321981425,
"grad_norm": 0.84765625,
"learning_rate": 0.0001,
"loss": 0.0579,
"step": 3540
},
{
"epoch": 10.990712074303406,
"grad_norm": 0.9921875,
"learning_rate": 0.0001,
"loss": 0.0348,
"step": 3550
},
{
"epoch": 11.021671826625386,
"grad_norm": 0.5546875,
"learning_rate": 0.0001,
"loss": 0.0299,
"step": 3560
},
{
"epoch": 11.052631578947368,
"grad_norm": 0.95703125,
"learning_rate": 0.0001,
"loss": 0.0307,
"step": 3570
},
{
"epoch": 11.08359133126935,
"grad_norm": 0.86328125,
"learning_rate": 0.0001,
"loss": 0.0273,
"step": 3580
},
{
"epoch": 11.114551083591332,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 0.0251,
"step": 3590
},
{
"epoch": 11.145510835913313,
"grad_norm": 0.271484375,
"learning_rate": 0.0001,
"loss": 0.0149,
"step": 3600
},
{
"epoch": 11.176470588235293,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 0.0327,
"step": 3610
},
{
"epoch": 11.207430340557275,
"grad_norm": 0.796875,
"learning_rate": 0.0001,
"loss": 0.0271,
"step": 3620
},
{
"epoch": 11.238390092879257,
"grad_norm": 1.390625,
"learning_rate": 0.0001,
"loss": 0.0276,
"step": 3630
},
{
"epoch": 11.269349845201239,
"grad_norm": 0.498046875,
"learning_rate": 0.0001,
"loss": 0.0228,
"step": 3640
},
{
"epoch": 11.30030959752322,
"grad_norm": 0.2109375,
"learning_rate": 0.0001,
"loss": 0.0226,
"step": 3650
},
{
"epoch": 11.3312693498452,
"grad_norm": 0.83203125,
"learning_rate": 0.0001,
"loss": 0.0352,
"step": 3660
},
{
"epoch": 11.362229102167182,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 0.0277,
"step": 3670
},
{
"epoch": 11.393188854489164,
"grad_norm": 1.1875,
"learning_rate": 0.0001,
"loss": 0.0296,
"step": 3680
},
{
"epoch": 11.424148606811146,
"grad_norm": 0.5703125,
"learning_rate": 0.0001,
"loss": 0.0221,
"step": 3690
},
{
"epoch": 11.455108359133128,
"grad_norm": 0.78125,
"learning_rate": 0.0001,
"loss": 0.0241,
"step": 3700
},
{
"epoch": 11.486068111455108,
"grad_norm": 0.9921875,
"learning_rate": 0.0001,
"loss": 0.0459,
"step": 3710
},
{
"epoch": 11.51702786377709,
"grad_norm": 1.0390625,
"learning_rate": 0.0001,
"loss": 0.0358,
"step": 3720
},
{
"epoch": 11.547987616099071,
"grad_norm": 0.8125,
"learning_rate": 0.0001,
"loss": 0.0294,
"step": 3730
},
{
"epoch": 11.578947368421053,
"grad_norm": 1.7890625,
"learning_rate": 0.0001,
"loss": 0.0359,
"step": 3740
},
{
"epoch": 11.609907120743035,
"grad_norm": 0.5078125,
"learning_rate": 0.0001,
"loss": 0.0281,
"step": 3750
},
{
"epoch": 11.640866873065015,
"grad_norm": 0.9375,
"learning_rate": 0.0001,
"loss": 0.0449,
"step": 3760
},
{
"epoch": 11.671826625386997,
"grad_norm": 1.046875,
"learning_rate": 0.0001,
"loss": 0.0386,
"step": 3770
},
{
"epoch": 11.702786377708978,
"grad_norm": 0.74609375,
"learning_rate": 0.0001,
"loss": 0.042,
"step": 3780
},
{
"epoch": 11.73374613003096,
"grad_norm": 0.82421875,
"learning_rate": 0.0001,
"loss": 0.0182,
"step": 3790
},
{
"epoch": 11.764705882352942,
"grad_norm": 1.4375,
"learning_rate": 0.0001,
"loss": 0.0254,
"step": 3800
},
{
"epoch": 11.795665634674922,
"grad_norm": 0.703125,
"learning_rate": 0.0001,
"loss": 0.054,
"step": 3810
},
{
"epoch": 11.826625386996904,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 0.0378,
"step": 3820
},
{
"epoch": 11.857585139318886,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 0.0355,
"step": 3830
},
{
"epoch": 11.888544891640867,
"grad_norm": 0.9765625,
"learning_rate": 0.0001,
"loss": 0.0251,
"step": 3840
},
{
"epoch": 11.91950464396285,
"grad_norm": 0.40234375,
"learning_rate": 0.0001,
"loss": 0.027,
"step": 3850
},
{
"epoch": 11.95046439628483,
"grad_norm": 0.83984375,
"learning_rate": 0.0001,
"loss": 0.0408,
"step": 3860
},
{
"epoch": 11.981424148606811,
"grad_norm": 0.9453125,
"learning_rate": 0.0001,
"loss": 0.0295,
"step": 3870
},
{
"epoch": 12.012383900928793,
"grad_norm": 0.8515625,
"learning_rate": 0.0001,
"loss": 0.0388,
"step": 3880
},
{
"epoch": 12.043343653250774,
"grad_norm": 1.28125,
"learning_rate": 0.0001,
"loss": 0.026,
"step": 3890
},
{
"epoch": 12.074303405572756,
"grad_norm": 0.88671875,
"learning_rate": 0.0001,
"loss": 0.0246,
"step": 3900
},
{
"epoch": 12.105263157894736,
"grad_norm": 1.4296875,
"learning_rate": 0.0001,
"loss": 0.0277,
"step": 3910
},
{
"epoch": 12.136222910216718,
"grad_norm": 0.376953125,
"learning_rate": 0.0001,
"loss": 0.0215,
"step": 3920
},
{
"epoch": 12.1671826625387,
"grad_norm": 0.83984375,
"learning_rate": 0.0001,
"loss": 0.039,
"step": 3930
},
{
"epoch": 12.198142414860682,
"grad_norm": 0.75390625,
"learning_rate": 0.0001,
"loss": 0.0321,
"step": 3940
},
{
"epoch": 12.229102167182663,
"grad_norm": 0.7734375,
"learning_rate": 0.0001,
"loss": 0.0201,
"step": 3950
},
{
"epoch": 12.260061919504643,
"grad_norm": 0.65625,
"learning_rate": 0.0001,
"loss": 0.0222,
"step": 3960
},
{
"epoch": 12.291021671826625,
"grad_norm": 1.0234375,
"learning_rate": 0.0001,
"loss": 0.0205,
"step": 3970
},
{
"epoch": 12.321981424148607,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 0.0223,
"step": 3980
},
{
"epoch": 12.352941176470589,
"grad_norm": 0.447265625,
"learning_rate": 0.0001,
"loss": 0.0308,
"step": 3990
},
{
"epoch": 12.38390092879257,
"grad_norm": 0.8046875,
"learning_rate": 0.0001,
"loss": 0.0247,
"step": 4000
},
{
"epoch": 12.41486068111455,
"grad_norm": 1.2421875,
"learning_rate": 0.0001,
"loss": 0.0173,
"step": 4010
},
{
"epoch": 12.445820433436532,
"grad_norm": 1.0078125,
"learning_rate": 0.0001,
"loss": 0.0118,
"step": 4020
},
{
"epoch": 12.476780185758514,
"grad_norm": 1.046875,
"learning_rate": 0.0001,
"loss": 0.0264,
"step": 4030
},
{
"epoch": 12.507739938080496,
"grad_norm": 0.96875,
"learning_rate": 0.0001,
"loss": 0.0324,
"step": 4040
},
{
"epoch": 12.538699690402478,
"grad_norm": 0.82421875,
"learning_rate": 0.0001,
"loss": 0.0231,
"step": 4050
},
{
"epoch": 12.569659442724458,
"grad_norm": 0.7890625,
"learning_rate": 0.0001,
"loss": 0.0173,
"step": 4060
},
{
"epoch": 12.60061919504644,
"grad_norm": 0.7265625,
"learning_rate": 0.0001,
"loss": 0.0322,
"step": 4070
},
{
"epoch": 12.631578947368421,
"grad_norm": 0.78125,
"learning_rate": 0.0001,
"loss": 0.03,
"step": 4080
},
{
"epoch": 12.662538699690403,
"grad_norm": 0.87109375,
"learning_rate": 0.0001,
"loss": 0.0317,
"step": 4090
},
{
"epoch": 12.693498452012385,
"grad_norm": 0.87890625,
"learning_rate": 0.0001,
"loss": 0.0313,
"step": 4100
},
{
"epoch": 12.724458204334365,
"grad_norm": 1.390625,
"learning_rate": 0.0001,
"loss": 0.023,
"step": 4110
},
{
"epoch": 12.755417956656347,
"grad_norm": 1.125,
"learning_rate": 0.0001,
"loss": 0.0227,
"step": 4120
},
{
"epoch": 12.786377708978328,
"grad_norm": 0.8828125,
"learning_rate": 0.0001,
"loss": 0.0307,
"step": 4130
},
{
"epoch": 12.81733746130031,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.0354,
"step": 4140
},
{
"epoch": 12.848297213622292,
"grad_norm": 1.125,
"learning_rate": 0.0001,
"loss": 0.0306,
"step": 4150
},
{
"epoch": 12.879256965944272,
"grad_norm": 1.40625,
"learning_rate": 0.0001,
"loss": 0.0267,
"step": 4160
},
{
"epoch": 12.910216718266254,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 0.0213,
"step": 4170
},
{
"epoch": 12.941176470588236,
"grad_norm": 0.9140625,
"learning_rate": 0.0001,
"loss": 0.0452,
"step": 4180
},
{
"epoch": 12.972136222910217,
"grad_norm": 1.0703125,
"learning_rate": 0.0001,
"loss": 0.0388,
"step": 4190
},
{
"epoch": 13.003095975232197,
"grad_norm": 0.48828125,
"learning_rate": 0.0001,
"loss": 0.0284,
"step": 4200
},
{
"epoch": 13.034055727554179,
"grad_norm": 0.859375,
"learning_rate": 0.0001,
"loss": 0.0331,
"step": 4210
},
{
"epoch": 13.06501547987616,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 0.0238,
"step": 4220
},
{
"epoch": 13.095975232198143,
"grad_norm": 0.81640625,
"learning_rate": 0.0001,
"loss": 0.0177,
"step": 4230
},
{
"epoch": 13.126934984520124,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 0.0205,
"step": 4240
},
{
"epoch": 13.157894736842104,
"grad_norm": 0.55078125,
"learning_rate": 0.0001,
"loss": 0.0352,
"step": 4250
},
{
"epoch": 13.188854489164086,
"grad_norm": 0.52734375,
"learning_rate": 0.0001,
"loss": 0.0358,
"step": 4260
},
{
"epoch": 13.219814241486068,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 0.0285,
"step": 4270
},
{
"epoch": 13.25077399380805,
"grad_norm": 0.9609375,
"learning_rate": 0.0001,
"loss": 0.0207,
"step": 4280
},
{
"epoch": 13.281733746130032,
"grad_norm": 1.5625,
"learning_rate": 0.0001,
"loss": 0.0222,
"step": 4290
},
{
"epoch": 13.312693498452012,
"grad_norm": 0.8671875,
"learning_rate": 0.0001,
"loss": 0.0234,
"step": 4300
},
{
"epoch": 13.343653250773993,
"grad_norm": 0.984375,
"learning_rate": 0.0001,
"loss": 0.0337,
"step": 4310
},
{
"epoch": 13.374613003095975,
"grad_norm": 0.427734375,
"learning_rate": 0.0001,
"loss": 0.0259,
"step": 4320
},
{
"epoch": 13.405572755417957,
"grad_norm": 1.140625,
"learning_rate": 0.0001,
"loss": 0.0281,
"step": 4330
},
{
"epoch": 13.436532507739939,
"grad_norm": 0.9140625,
"learning_rate": 0.0001,
"loss": 0.0242,
"step": 4340
},
{
"epoch": 13.467492260061919,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 0.0456,
"step": 4350
},
{
"epoch": 13.4984520123839,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 0.0372,
"step": 4360
},
{
"epoch": 13.529411764705882,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 0.027,
"step": 4370
},
{
"epoch": 13.560371517027864,
"grad_norm": 0.84765625,
"learning_rate": 0.0001,
"loss": 0.0266,
"step": 4380
},
{
"epoch": 13.591331269349846,
"grad_norm": 0.73046875,
"learning_rate": 0.0001,
"loss": 0.0203,
"step": 4390
},
{
"epoch": 13.622291021671826,
"grad_norm": 0.828125,
"learning_rate": 0.0001,
"loss": 0.0204,
"step": 4400
},
{
"epoch": 13.653250773993808,
"grad_norm": 0.7265625,
"learning_rate": 0.0001,
"loss": 0.0458,
"step": 4410
},
{
"epoch": 13.68421052631579,
"grad_norm": 0.45703125,
"learning_rate": 0.0001,
"loss": 0.0324,
"step": 4420
},
{
"epoch": 13.715170278637771,
"grad_norm": 1.0,
"learning_rate": 0.0001,
"loss": 0.0279,
"step": 4430
},
{
"epoch": 13.746130030959753,
"grad_norm": 1.21875,
"learning_rate": 0.0001,
"loss": 0.0281,
"step": 4440
},
{
"epoch": 13.777089783281733,
"grad_norm": 1.0078125,
"learning_rate": 0.0001,
"loss": 0.0189,
"step": 4450
},
{
"epoch": 13.808049535603715,
"grad_norm": 0.70703125,
"learning_rate": 0.0001,
"loss": 0.0461,
"step": 4460
},
{
"epoch": 13.839009287925697,
"grad_norm": 1.2578125,
"learning_rate": 0.0001,
"loss": 0.0346,
"step": 4470
},
{
"epoch": 13.869969040247678,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.0338,
"step": 4480
},
{
"epoch": 13.90092879256966,
"grad_norm": 1.4375,
"learning_rate": 0.0001,
"loss": 0.0323,
"step": 4490
},
{
"epoch": 13.93188854489164,
"grad_norm": 0.69140625,
"learning_rate": 0.0001,
"loss": 0.0363,
"step": 4500
},
{
"epoch": 13.962848297213622,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.0411,
"step": 4510
},
{
"epoch": 13.993808049535604,
"grad_norm": 2.09375,
"learning_rate": 0.0001,
"loss": 0.0235,
"step": 4520
},
{
"epoch": 14.024767801857585,
"grad_norm": 1.0859375,
"learning_rate": 0.0001,
"loss": 0.028,
"step": 4530
},
{
"epoch": 14.055727554179567,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 0.0288,
"step": 4540
},
{
"epoch": 14.086687306501547,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.0286,
"step": 4550
},
{
"epoch": 14.117647058823529,
"grad_norm": 0.5234375,
"learning_rate": 0.0001,
"loss": 0.0253,
"step": 4560
},
{
"epoch": 14.14860681114551,
"grad_norm": 1.171875,
"learning_rate": 0.0001,
"loss": 0.0244,
"step": 4570
},
{
"epoch": 14.179566563467493,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 0.0405,
"step": 4580
},
{
"epoch": 14.210526315789474,
"grad_norm": 0.6796875,
"learning_rate": 0.0001,
"loss": 0.0255,
"step": 4590
},
{
"epoch": 14.241486068111454,
"grad_norm": 1.515625,
"learning_rate": 0.0001,
"loss": 0.0242,
"step": 4600
},
{
"epoch": 14.272445820433436,
"grad_norm": 1.3984375,
"learning_rate": 0.0001,
"loss": 0.0304,
"step": 4610
},
{
"epoch": 14.303405572755418,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.0194,
"step": 4620
},
{
"epoch": 14.3343653250774,
"grad_norm": 0.51171875,
"learning_rate": 0.0001,
"loss": 0.033,
"step": 4630
},
{
"epoch": 14.365325077399381,
"grad_norm": 0.69921875,
"learning_rate": 0.0001,
"loss": 0.032,
"step": 4640
},
{
"epoch": 14.396284829721361,
"grad_norm": 0.74609375,
"learning_rate": 0.0001,
"loss": 0.0223,
"step": 4650
},
{
"epoch": 14.427244582043343,
"grad_norm": 1.171875,
"learning_rate": 0.0001,
"loss": 0.0294,
"step": 4660
},
{
"epoch": 14.458204334365325,
"grad_norm": 1.421875,
"learning_rate": 0.0001,
"loss": 0.0339,
"step": 4670
},
{
"epoch": 14.489164086687307,
"grad_norm": 0.890625,
"learning_rate": 0.0001,
"loss": 0.0364,
"step": 4680
},
{
"epoch": 14.520123839009289,
"grad_norm": 0.396484375,
"learning_rate": 0.0001,
"loss": 0.0268,
"step": 4690
},
{
"epoch": 14.551083591331269,
"grad_norm": 1.2109375,
"learning_rate": 0.0001,
"loss": 0.0193,
"step": 4700
},
{
"epoch": 14.58204334365325,
"grad_norm": 0.15625,
"learning_rate": 0.0001,
"loss": 0.0213,
"step": 4710
},
{
"epoch": 14.613003095975232,
"grad_norm": 0.63671875,
"learning_rate": 0.0001,
"loss": 0.0141,
"step": 4720
},
{
"epoch": 14.643962848297214,
"grad_norm": 0.81640625,
"learning_rate": 0.0001,
"loss": 0.037,
"step": 4730
},
{
"epoch": 14.674922600619196,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 0.033,
"step": 4740
},
{
"epoch": 14.705882352941176,
"grad_norm": 0.92578125,
"learning_rate": 0.0001,
"loss": 0.0237,
"step": 4750
},
{
"epoch": 14.736842105263158,
"grad_norm": 1.2734375,
"learning_rate": 0.0001,
"loss": 0.0277,
"step": 4760
},
{
"epoch": 14.76780185758514,
"grad_norm": 1.59375,
"learning_rate": 0.0001,
"loss": 0.024,
"step": 4770
},
{
"epoch": 14.798761609907121,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 0.0478,
"step": 4780
},
{
"epoch": 14.829721362229103,
"grad_norm": 1.078125,
"learning_rate": 0.0001,
"loss": 0.0321,
"step": 4790
},
{
"epoch": 14.860681114551083,
"grad_norm": 0.50390625,
"learning_rate": 0.0001,
"loss": 0.0309,
"step": 4800
},
{
"epoch": 14.891640866873065,
"grad_norm": 0.546875,
"learning_rate": 0.0001,
"loss": 0.026,
"step": 4810
},
{
"epoch": 14.922600619195046,
"grad_norm": 0.796875,
"learning_rate": 0.0001,
"loss": 0.0256,
"step": 4820
},
{
"epoch": 14.953560371517028,
"grad_norm": 1.2421875,
"learning_rate": 0.0001,
"loss": 0.0351,
"step": 4830
},
{
"epoch": 14.98452012383901,
"grad_norm": 0.4375,
"learning_rate": 0.0001,
"loss": 0.0329,
"step": 4840
},
{
"epoch": 15.01547987616099,
"grad_norm": 0.6015625,
"learning_rate": 0.0001,
"loss": 0.0286,
"step": 4850
},
{
"epoch": 15.046439628482972,
"grad_norm": 1.34375,
"learning_rate": 0.0001,
"loss": 0.0301,
"step": 4860
},
{
"epoch": 15.077399380804954,
"grad_norm": 0.99609375,
"learning_rate": 0.0001,
"loss": 0.0247,
"step": 4870
},
{
"epoch": 15.108359133126935,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.0237,
"step": 4880
},
{
"epoch": 15.139318885448917,
"grad_norm": 0.87890625,
"learning_rate": 0.0001,
"loss": 0.0248,
"step": 4890
},
{
"epoch": 15.170278637770897,
"grad_norm": 0.474609375,
"learning_rate": 0.0001,
"loss": 0.0246,
"step": 4900
},
{
"epoch": 15.201238390092879,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 0.0234,
"step": 4910
},
{
"epoch": 15.23219814241486,
"grad_norm": 1.4453125,
"learning_rate": 0.0001,
"loss": 0.0181,
"step": 4920
},
{
"epoch": 15.263157894736842,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 0.0288,
"step": 4930
},
{
"epoch": 15.294117647058824,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 0.0234,
"step": 4940
},
{
"epoch": 15.325077399380804,
"grad_norm": 0.69140625,
"learning_rate": 0.0001,
"loss": 0.0323,
"step": 4950
},
{
"epoch": 15.356037151702786,
"grad_norm": 1.421875,
"learning_rate": 0.0001,
"loss": 0.0296,
"step": 4960
},
{
"epoch": 15.386996904024768,
"grad_norm": 0.83203125,
"learning_rate": 0.0001,
"loss": 0.0267,
"step": 4970
},
{
"epoch": 15.41795665634675,
"grad_norm": 0.498046875,
"learning_rate": 0.0001,
"loss": 0.0256,
"step": 4980
},
{
"epoch": 15.448916408668731,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 0.0307,
"step": 4990
},
{
"epoch": 15.479876160990711,
"grad_norm": 0.9296875,
"learning_rate": 0.0001,
"loss": 0.0381,
"step": 5000
},
{
"epoch": 15.510835913312693,
"grad_norm": 0.326171875,
"learning_rate": 0.0001,
"loss": 0.022,
"step": 5010
},
{
"epoch": 15.541795665634675,
"grad_norm": 1.0234375,
"learning_rate": 0.0001,
"loss": 0.0214,
"step": 5020
},
{
"epoch": 15.572755417956657,
"grad_norm": 0.58984375,
"learning_rate": 0.0001,
"loss": 0.023,
"step": 5030
},
{
"epoch": 15.603715170278639,
"grad_norm": 0.1162109375,
"learning_rate": 0.0001,
"loss": 0.0278,
"step": 5040
},
{
"epoch": 15.634674922600619,
"grad_norm": 0.74609375,
"learning_rate": 0.0001,
"loss": 0.0283,
"step": 5050
},
{
"epoch": 15.6656346749226,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 0.0256,
"step": 5060
},
{
"epoch": 15.696594427244582,
"grad_norm": 0.70703125,
"learning_rate": 0.0001,
"loss": 0.031,
"step": 5070
},
{
"epoch": 15.727554179566564,
"grad_norm": 0.404296875,
"learning_rate": 0.0001,
"loss": 0.0159,
"step": 5080
},
{
"epoch": 15.758513931888546,
"grad_norm": 1.359375,
"learning_rate": 0.0001,
"loss": 0.0241,
"step": 5090
},
{
"epoch": 15.789473684210526,
"grad_norm": 0.890625,
"learning_rate": 0.0001,
"loss": 0.0371,
"step": 5100
},
{
"epoch": 15.820433436532507,
"grad_norm": 0.67578125,
"learning_rate": 0.0001,
"loss": 0.0257,
"step": 5110
},
{
"epoch": 15.85139318885449,
"grad_norm": 0.75390625,
"learning_rate": 0.0001,
"loss": 0.0325,
"step": 5120
},
{
"epoch": 15.882352941176471,
"grad_norm": 1.25,
"learning_rate": 0.0001,
"loss": 0.0235,
"step": 5130
},
{
"epoch": 15.913312693498453,
"grad_norm": 0.79296875,
"learning_rate": 0.0001,
"loss": 0.0241,
"step": 5140
},
{
"epoch": 15.944272445820433,
"grad_norm": 0.8125,
"learning_rate": 0.0001,
"loss": 0.0271,
"step": 5150
},
{
"epoch": 15.975232198142415,
"grad_norm": 0.875,
"learning_rate": 0.0001,
"loss": 0.0311,
"step": 5160
},
{
"epoch": 16.006191950464395,
"grad_norm": 1.0703125,
"learning_rate": 0.0001,
"loss": 0.0263,
"step": 5170
},
{
"epoch": 16.037151702786378,
"grad_norm": 0.7421875,
"learning_rate": 0.0001,
"loss": 0.0198,
"step": 5180
},
{
"epoch": 16.068111455108358,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 0.0188,
"step": 5190
},
{
"epoch": 16.09907120743034,
"grad_norm": 1.0703125,
"learning_rate": 0.0001,
"loss": 0.0174,
"step": 5200
},
{
"epoch": 16.13003095975232,
"grad_norm": 0.61328125,
"learning_rate": 0.0001,
"loss": 0.022,
"step": 5210
},
{
"epoch": 16.1609907120743,
"grad_norm": 0.5859375,
"learning_rate": 0.0001,
"loss": 0.0216,
"step": 5220
},
{
"epoch": 16.191950464396285,
"grad_norm": 0.54296875,
"learning_rate": 0.0001,
"loss": 0.0271,
"step": 5230
},
{
"epoch": 16.222910216718265,
"grad_norm": 0.87109375,
"learning_rate": 0.0001,
"loss": 0.0227,
"step": 5240
},
{
"epoch": 16.25386996904025,
"grad_norm": 0.1513671875,
"learning_rate": 0.0001,
"loss": 0.0143,
"step": 5250
},
{
"epoch": 16.28482972136223,
"grad_norm": 0.80859375,
"learning_rate": 0.0001,
"loss": 0.0261,
"step": 5260
},
{
"epoch": 16.31578947368421,
"grad_norm": 0.8203125,
"learning_rate": 0.0001,
"loss": 0.0168,
"step": 5270
},
{
"epoch": 16.346749226006192,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.0272,
"step": 5280
},
{
"epoch": 16.377708978328172,
"grad_norm": 0.91015625,
"learning_rate": 0.0001,
"loss": 0.0272,
"step": 5290
},
{
"epoch": 16.408668730650156,
"grad_norm": 0.44140625,
"learning_rate": 0.0001,
"loss": 0.0264,
"step": 5300
},
{
"epoch": 16.439628482972136,
"grad_norm": 0.1796875,
"learning_rate": 0.0001,
"loss": 0.022,
"step": 5310
},
{
"epoch": 16.470588235294116,
"grad_norm": 0.6953125,
"learning_rate": 0.0001,
"loss": 0.0292,
"step": 5320
},
{
"epoch": 16.5015479876161,
"grad_norm": 0.7265625,
"learning_rate": 0.0001,
"loss": 0.0317,
"step": 5330
},
{
"epoch": 16.53250773993808,
"grad_norm": 1.1640625,
"learning_rate": 0.0001,
"loss": 0.025,
"step": 5340
},
{
"epoch": 16.563467492260063,
"grad_norm": 1.2109375,
"learning_rate": 0.0001,
"loss": 0.0223,
"step": 5350
},
{
"epoch": 16.594427244582043,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 0.0171,
"step": 5360
},
{
"epoch": 16.625386996904023,
"grad_norm": 0.82421875,
"learning_rate": 0.0001,
"loss": 0.028,
"step": 5370
},
{
"epoch": 16.656346749226007,
"grad_norm": 0.7421875,
"learning_rate": 0.0001,
"loss": 0.0324,
"step": 5380
},
{
"epoch": 16.687306501547987,
"grad_norm": 0.7734375,
"learning_rate": 0.0001,
"loss": 0.0209,
"step": 5390
},
{
"epoch": 16.71826625386997,
"grad_norm": 0.8203125,
"learning_rate": 0.0001,
"loss": 0.0192,
"step": 5400
},
{
"epoch": 16.74922600619195,
"grad_norm": 0.87109375,
"learning_rate": 0.0001,
"loss": 0.0289,
"step": 5410
},
{
"epoch": 16.78018575851393,
"grad_norm": 0.8828125,
"learning_rate": 0.0001,
"loss": 0.0263,
"step": 5420
},
{
"epoch": 16.811145510835914,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 0.0348,
"step": 5430
},
{
"epoch": 16.842105263157894,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 0.023,
"step": 5440
},
{
"epoch": 16.873065015479877,
"grad_norm": 0.9140625,
"learning_rate": 0.0001,
"loss": 0.0192,
"step": 5450
},
{
"epoch": 16.904024767801857,
"grad_norm": 1.9921875,
"learning_rate": 0.0001,
"loss": 0.0272,
"step": 5460
},
{
"epoch": 16.934984520123837,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.0254,
"step": 5470
},
{
"epoch": 16.96594427244582,
"grad_norm": 1.4375,
"learning_rate": 0.0001,
"loss": 0.0399,
"step": 5480
},
{
"epoch": 16.9969040247678,
"grad_norm": 1.3984375,
"learning_rate": 0.0001,
"loss": 0.0345,
"step": 5490
},
{
"epoch": 17.027863777089784,
"grad_norm": 0.96484375,
"learning_rate": 0.0001,
"loss": 0.0233,
"step": 5500
},
{
"epoch": 17.058823529411764,
"grad_norm": 0.80078125,
"learning_rate": 0.0001,
"loss": 0.0205,
"step": 5510
},
{
"epoch": 17.089783281733745,
"grad_norm": 0.27734375,
"learning_rate": 0.0001,
"loss": 0.0282,
"step": 5520
},
{
"epoch": 17.120743034055728,
"grad_norm": 0.1708984375,
"learning_rate": 0.0001,
"loss": 0.0279,
"step": 5530
},
{
"epoch": 17.151702786377708,
"grad_norm": 0.32421875,
"learning_rate": 0.0001,
"loss": 0.0274,
"step": 5540
},
{
"epoch": 17.18266253869969,
"grad_norm": 0.6171875,
"learning_rate": 0.0001,
"loss": 0.0289,
"step": 5550
},
{
"epoch": 17.21362229102167,
"grad_norm": 0.6953125,
"learning_rate": 0.0001,
"loss": 0.0235,
"step": 5560
},
{
"epoch": 17.24458204334365,
"grad_norm": 1.140625,
"learning_rate": 0.0001,
"loss": 0.0236,
"step": 5570
},
{
"epoch": 17.275541795665635,
"grad_norm": 1.3359375,
"learning_rate": 0.0001,
"loss": 0.0229,
"step": 5580
},
{
"epoch": 17.306501547987615,
"grad_norm": 1.7734375,
"learning_rate": 0.0001,
"loss": 0.027,
"step": 5590
},
{
"epoch": 17.3374613003096,
"grad_norm": 0.494140625,
"learning_rate": 0.0001,
"loss": 0.0254,
"step": 5600
},
{
"epoch": 17.36842105263158,
"grad_norm": 0.90234375,
"learning_rate": 0.0001,
"loss": 0.0232,
"step": 5610
},
{
"epoch": 17.39938080495356,
"grad_norm": 0.71484375,
"learning_rate": 0.0001,
"loss": 0.0189,
"step": 5620
},
{
"epoch": 17.430340557275542,
"grad_norm": 0.62890625,
"learning_rate": 0.0001,
"loss": 0.0262,
"step": 5630
},
{
"epoch": 17.461300309597522,
"grad_norm": 1.1796875,
"learning_rate": 0.0001,
"loss": 0.0114,
"step": 5640
},
{
"epoch": 17.492260061919506,
"grad_norm": 0.72265625,
"learning_rate": 0.0001,
"loss": 0.0345,
"step": 5650
},
{
"epoch": 17.523219814241486,
"grad_norm": 0.765625,
"learning_rate": 0.0001,
"loss": 0.0289,
"step": 5660
},
{
"epoch": 17.554179566563466,
"grad_norm": 0.73828125,
"learning_rate": 0.0001,
"loss": 0.0186,
"step": 5670
},
{
"epoch": 17.58513931888545,
"grad_norm": 0.52734375,
"learning_rate": 0.0001,
"loss": 0.0317,
"step": 5680
},
{
"epoch": 17.61609907120743,
"grad_norm": 0.25390625,
"learning_rate": 0.0001,
"loss": 0.0181,
"step": 5690
},
{
"epoch": 17.647058823529413,
"grad_norm": 0.7578125,
"learning_rate": 0.0001,
"loss": 0.0364,
"step": 5700
},
{
"epoch": 17.678018575851393,
"grad_norm": 0.81640625,
"learning_rate": 0.0001,
"loss": 0.0288,
"step": 5710
},
{
"epoch": 17.708978328173373,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 0.0215,
"step": 5720
},
{
"epoch": 17.739938080495357,
"grad_norm": 2.078125,
"learning_rate": 0.0001,
"loss": 0.0232,
"step": 5730
},
{
"epoch": 17.770897832817337,
"grad_norm": 1.7578125,
"learning_rate": 0.0001,
"loss": 0.0192,
"step": 5740
},
{
"epoch": 17.80185758513932,
"grad_norm": 0.875,
"learning_rate": 0.0001,
"loss": 0.0388,
"step": 5750
},
{
"epoch": 17.8328173374613,
"grad_norm": 1.125,
"learning_rate": 0.0001,
"loss": 0.0225,
"step": 5760
},
{
"epoch": 17.86377708978328,
"grad_norm": 1.1640625,
"learning_rate": 0.0001,
"loss": 0.0256,
"step": 5770
},
{
"epoch": 17.894736842105264,
"grad_norm": 0.84375,
"learning_rate": 0.0001,
"loss": 0.0192,
"step": 5780
},
{
"epoch": 17.925696594427244,
"grad_norm": 1.375,
"learning_rate": 0.0001,
"loss": 0.0354,
"step": 5790
},
{
"epoch": 17.956656346749227,
"grad_norm": 0.56640625,
"learning_rate": 0.0001,
"loss": 0.0361,
"step": 5800
},
{
"epoch": 17.987616099071207,
"grad_norm": 0.62109375,
"learning_rate": 0.0001,
"loss": 0.0357,
"step": 5810
},
{
"epoch": 18.018575851393187,
"grad_norm": 0.7734375,
"learning_rate": 0.0001,
"loss": 0.0196,
"step": 5820
},
{
"epoch": 18.04953560371517,
"grad_norm": 0.55859375,
"learning_rate": 0.0001,
"loss": 0.0231,
"step": 5830
},
{
"epoch": 18.08049535603715,
"grad_norm": 0.765625,
"learning_rate": 0.0001,
"loss": 0.0255,
"step": 5840
},
{
"epoch": 18.111455108359134,
"grad_norm": 0.91796875,
"learning_rate": 0.0001,
"loss": 0.0205,
"step": 5850
},
{
"epoch": 18.142414860681114,
"grad_norm": 0.443359375,
"learning_rate": 0.0001,
"loss": 0.0204,
"step": 5860
},
{
"epoch": 18.173374613003094,
"grad_norm": 0.87890625,
"learning_rate": 0.0001,
"loss": 0.0271,
"step": 5870
},
{
"epoch": 18.204334365325078,
"grad_norm": 0.6953125,
"learning_rate": 0.0001,
"loss": 0.0197,
"step": 5880
},
{
"epoch": 18.235294117647058,
"grad_norm": 0.419921875,
"learning_rate": 0.0001,
"loss": 0.0242,
"step": 5890
},
{
"epoch": 18.26625386996904,
"grad_norm": 1.171875,
"learning_rate": 0.0001,
"loss": 0.0176,
"step": 5900
},
{
"epoch": 18.29721362229102,
"grad_norm": 0.74609375,
"learning_rate": 0.0001,
"loss": 0.0287,
"step": 5910
},
{
"epoch": 18.328173374613,
"grad_norm": 0.7109375,
"learning_rate": 0.0001,
"loss": 0.0243,
"step": 5920
},
{
"epoch": 18.359133126934985,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.0255,
"step": 5930
},
{
"epoch": 18.390092879256965,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.0256,
"step": 5940
},
{
"epoch": 18.42105263157895,
"grad_norm": 0.90234375,
"learning_rate": 0.0001,
"loss": 0.0248,
"step": 5950
},
{
"epoch": 18.45201238390093,
"grad_norm": 1.4765625,
"learning_rate": 0.0001,
"loss": 0.0321,
"step": 5960
},
{
"epoch": 18.48297213622291,
"grad_norm": 0.6640625,
"learning_rate": 0.0001,
"loss": 0.0274,
"step": 5970
},
{
"epoch": 18.513931888544892,
"grad_norm": 0.69921875,
"learning_rate": 0.0001,
"loss": 0.0307,
"step": 5980
},
{
"epoch": 18.544891640866872,
"grad_norm": 1.046875,
"learning_rate": 0.0001,
"loss": 0.025,
"step": 5990
},
{
"epoch": 18.575851393188856,
"grad_norm": 0.388671875,
"learning_rate": 0.0001,
"loss": 0.0172,
"step": 6000
},
{
"epoch": 18.606811145510836,
"grad_norm": 0.703125,
"learning_rate": 0.0001,
"loss": 0.0103,
"step": 6010
},
{
"epoch": 18.637770897832816,
"grad_norm": 0.875,
"learning_rate": 0.0001,
"loss": 0.0398,
"step": 6020
},
{
"epoch": 18.6687306501548,
"grad_norm": 0.6953125,
"learning_rate": 0.0001,
"loss": 0.026,
"step": 6030
},
{
"epoch": 18.69969040247678,
"grad_norm": 0.12890625,
"learning_rate": 0.0001,
"loss": 0.0227,
"step": 6040
},
{
"epoch": 18.730650154798763,
"grad_norm": 0.294921875,
"learning_rate": 0.0001,
"loss": 0.0217,
"step": 6050
},
{
"epoch": 18.761609907120743,
"grad_norm": 0.51171875,
"learning_rate": 0.0001,
"loss": 0.0141,
"step": 6060
},
{
"epoch": 18.792569659442723,
"grad_norm": 0.9296875,
"learning_rate": 0.0001,
"loss": 0.0314,
"step": 6070
},
{
"epoch": 18.823529411764707,
"grad_norm": 0.4296875,
"learning_rate": 0.0001,
"loss": 0.0227,
"step": 6080
},
{
"epoch": 18.854489164086687,
"grad_norm": 0.875,
"learning_rate": 0.0001,
"loss": 0.0313,
"step": 6090
},
{
"epoch": 18.88544891640867,
"grad_norm": 0.80078125,
"learning_rate": 0.0001,
"loss": 0.0243,
"step": 6100
},
{
"epoch": 18.91640866873065,
"grad_norm": 0.06982421875,
"learning_rate": 0.0001,
"loss": 0.0211,
"step": 6110
},
{
"epoch": 18.94736842105263,
"grad_norm": 1.046875,
"learning_rate": 0.0001,
"loss": 0.0342,
"step": 6120
},
{
"epoch": 18.978328173374614,
"grad_norm": 0.46484375,
"learning_rate": 0.0001,
"loss": 0.0287,
"step": 6130
},
{
"epoch": 19.009287925696594,
"grad_norm": 0.53125,
"learning_rate": 0.0001,
"loss": 0.0293,
"step": 6140
},
{
"epoch": 19.040247678018577,
"grad_norm": 0.59375,
"learning_rate": 0.0001,
"loss": 0.0207,
"step": 6150
},
{
"epoch": 19.071207430340557,
"grad_norm": 0.35546875,
"learning_rate": 0.0001,
"loss": 0.0137,
"step": 6160
},
{
"epoch": 19.102167182662537,
"grad_norm": 0.87890625,
"learning_rate": 0.0001,
"loss": 0.0286,
"step": 6170
},
{
"epoch": 19.13312693498452,
"grad_norm": 0.2138671875,
"learning_rate": 0.0001,
"loss": 0.0153,
"step": 6180
},
{
"epoch": 19.1640866873065,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 0.0256,
"step": 6190
},
{
"epoch": 19.195046439628484,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 0.0273,
"step": 6200
},
{
"epoch": 19.226006191950464,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.0226,
"step": 6210
},
{
"epoch": 19.256965944272444,
"grad_norm": 0.96484375,
"learning_rate": 0.0001,
"loss": 0.014,
"step": 6220
},
{
"epoch": 19.287925696594428,
"grad_norm": 1.1640625,
"learning_rate": 0.0001,
"loss": 0.026,
"step": 6230
},
{
"epoch": 19.318885448916408,
"grad_norm": 0.60546875,
"learning_rate": 0.0001,
"loss": 0.019,
"step": 6240
},
{
"epoch": 19.34984520123839,
"grad_norm": 0.6953125,
"learning_rate": 0.0001,
"loss": 0.0279,
"step": 6250
},
{
"epoch": 19.38080495356037,
"grad_norm": 0.6484375,
"learning_rate": 0.0001,
"loss": 0.0209,
"step": 6260
},
{
"epoch": 19.41176470588235,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 0.0243,
"step": 6270
},
{
"epoch": 19.442724458204335,
"grad_norm": 0.5390625,
"learning_rate": 0.0001,
"loss": 0.0199,
"step": 6280
},
{
"epoch": 19.473684210526315,
"grad_norm": 0.6328125,
"learning_rate": 0.0001,
"loss": 0.0384,
"step": 6290
},
{
"epoch": 19.5046439628483,
"grad_norm": 1.0,
"learning_rate": 0.0001,
"loss": 0.026,
"step": 6300
},
{
"epoch": 19.53560371517028,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 0.0199,
"step": 6310
},
{
"epoch": 19.56656346749226,
"grad_norm": 1.2890625,
"learning_rate": 0.0001,
"loss": 0.0317,
"step": 6320
},
{
"epoch": 19.597523219814242,
"grad_norm": 0.52734375,
"learning_rate": 0.0001,
"loss": 0.0292,
"step": 6330
},
{
"epoch": 19.628482972136222,
"grad_norm": 0.77734375,
"learning_rate": 0.0001,
"loss": 0.0298,
"step": 6340
},
{
"epoch": 19.659442724458206,
"grad_norm": 0.91015625,
"learning_rate": 0.0001,
"loss": 0.0335,
"step": 6350
},
{
"epoch": 19.690402476780186,
"grad_norm": 0.3046875,
"learning_rate": 0.0001,
"loss": 0.0287,
"step": 6360
},
{
"epoch": 19.721362229102166,
"grad_norm": 0.87109375,
"learning_rate": 0.0001,
"loss": 0.0213,
"step": 6370
},
{
"epoch": 19.75232198142415,
"grad_norm": 0.48828125,
"learning_rate": 0.0001,
"loss": 0.0155,
"step": 6380
},
{
"epoch": 19.78328173374613,
"grad_norm": 1.0546875,
"learning_rate": 0.0001,
"loss": 0.0211,
"step": 6390
},
{
"epoch": 19.814241486068113,
"grad_norm": 0.609375,
"learning_rate": 0.0001,
"loss": 0.0323,
"step": 6400
},
{
"epoch": 19.845201238390093,
"grad_norm": 1.015625,
"learning_rate": 0.0001,
"loss": 0.0206,
"step": 6410
},
{
"epoch": 19.876160990712073,
"grad_norm": 1.03125,
"learning_rate": 0.0001,
"loss": 0.0151,
"step": 6420
},
{
"epoch": 19.907120743034056,
"grad_norm": 0.66015625,
"learning_rate": 0.0001,
"loss": 0.0178,
"step": 6430
},
{
"epoch": 19.938080495356036,
"grad_norm": 1.140625,
"learning_rate": 0.0001,
"loss": 0.0226,
"step": 6440
},
{
"epoch": 19.96904024767802,
"grad_norm": 1.15625,
"learning_rate": 0.0001,
"loss": 0.0291,
"step": 6450
},
{
"epoch": 20.0,
"grad_norm": 0.06396484375,
"learning_rate": 0.0001,
"loss": 0.0164,
"step": 6460
},
{
"epoch": 20.03095975232198,
"grad_norm": 0.94140625,
"learning_rate": 0.0001,
"loss": 0.02,
"step": 6470
},
{
"epoch": 20.061919504643964,
"grad_norm": 0.71875,
"learning_rate": 0.0001,
"loss": 0.0126,
"step": 6480
},
{
"epoch": 20.092879256965944,
"grad_norm": 0.416015625,
"learning_rate": 0.0001,
"loss": 0.0153,
"step": 6490
},
{
"epoch": 20.123839009287927,
"grad_norm": 1.671875,
"learning_rate": 0.0001,
"loss": 0.0186,
"step": 6500
},
{
"epoch": 20.154798761609907,
"grad_norm": 0.50390625,
"learning_rate": 0.0001,
"loss": 0.0212,
"step": 6510
},
{
"epoch": 20.185758513931887,
"grad_norm": 0.59765625,
"learning_rate": 0.0001,
"loss": 0.0213,
"step": 6520
},
{
"epoch": 20.21671826625387,
"grad_norm": 0.470703125,
"learning_rate": 0.0001,
"loss": 0.0243,
"step": 6530
},
{
"epoch": 20.24767801857585,
"grad_norm": 1.609375,
"learning_rate": 0.0001,
"loss": 0.0153,
"step": 6540
},
{
"epoch": 20.278637770897834,
"grad_norm": 0.44140625,
"learning_rate": 0.0001,
"loss": 0.0196,
"step": 6550
},
{
"epoch": 20.309597523219814,
"grad_norm": 0.318359375,
"learning_rate": 0.0001,
"loss": 0.0068,
"step": 6560
},
{
"epoch": 20.340557275541794,
"grad_norm": 0.69921875,
"learning_rate": 0.0001,
"loss": 0.0212,
"step": 6570
},
{
"epoch": 20.371517027863778,
"grad_norm": 0.71484375,
"learning_rate": 0.0001,
"loss": 0.013,
"step": 6580
},
{
"epoch": 20.402476780185758,
"grad_norm": 0.5625,
"learning_rate": 0.0001,
"loss": 0.016,
"step": 6590
},
{
"epoch": 20.43343653250774,
"grad_norm": 0.890625,
"learning_rate": 0.0001,
"loss": 0.0228,
"step": 6600
},
{
"epoch": 20.46439628482972,
"grad_norm": 0.3125,
"learning_rate": 0.0001,
"loss": 0.0068,
"step": 6610
},
{
"epoch": 20.4953560371517,
"grad_norm": 0.470703125,
"learning_rate": 0.0001,
"loss": 0.0254,
"step": 6620
},
{
"epoch": 20.526315789473685,
"grad_norm": 0.5234375,
"learning_rate": 0.0001,
"loss": 0.0173,
"step": 6630
},
{
"epoch": 20.557275541795665,
"grad_norm": 0.74609375,
"learning_rate": 0.0001,
"loss": 0.0164,
"step": 6640
},
{
"epoch": 20.58823529411765,
"grad_norm": 1.34375,
"learning_rate": 0.0001,
"loss": 0.0245,
"step": 6650
},
{
"epoch": 20.61919504643963,
"grad_norm": 0.236328125,
"learning_rate": 0.0001,
"loss": 0.0175,
"step": 6660
},
{
"epoch": 20.65015479876161,
"grad_norm": 0.671875,
"learning_rate": 0.0001,
"loss": 0.0274,
"step": 6670
},
{
"epoch": 20.681114551083592,
"grad_norm": 1.390625,
"learning_rate": 0.0001,
"loss": 0.0218,
"step": 6680
},
{
"epoch": 20.712074303405572,
"grad_norm": 1.2421875,
"learning_rate": 0.0001,
"loss": 0.0172,
"step": 6690
},
{
"epoch": 20.743034055727556,
"grad_norm": 0.7890625,
"learning_rate": 0.0001,
"loss": 0.0156,
"step": 6700
}
],
"logging_steps": 10,
"max_steps": 64600,
"num_input_tokens_seen": 0,
"num_train_epochs": 200,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.077655366024663e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}