0.3b-bara-mc4-0.8-10k / trainer_state.json
parasora's picture
Upload 8 files
e329e3b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.25438041080051527,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 0.0003999999840334324,
"loss": 8.1004,
"step": 10
},
{
"epoch": 0.0,
"learning_rate": 0.000399999936133732,
"loss": 6.2987,
"step": 20
},
{
"epoch": 0.0,
"learning_rate": 0.0003999998563009066,
"loss": 6.0792,
"step": 30
},
{
"epoch": 0.0,
"learning_rate": 0.00039999974453496884,
"loss": 5.8398,
"step": 40
},
{
"epoch": 0.0,
"learning_rate": 0.0003999996008359366,
"loss": 5.5406,
"step": 50
},
{
"epoch": 0.0,
"learning_rate": 0.0003999994252038328,
"loss": 5.2281,
"step": 60
},
{
"epoch": 0.0,
"learning_rate": 0.0003999992176386855,
"loss": 5.0488,
"step": 70
},
{
"epoch": 0.0,
"learning_rate": 0.00039999897814052787,
"loss": 4.8797,
"step": 80
},
{
"epoch": 0.0,
"learning_rate": 0.00039999870670939813,
"loss": 4.726,
"step": 90
},
{
"epoch": 0.0,
"learning_rate": 0.00039999840334533965,
"loss": 4.5608,
"step": 100
},
{
"epoch": 0.0,
"learning_rate": 0.0003999980680484007,
"loss": 4.4583,
"step": 110
},
{
"epoch": 0.0,
"learning_rate": 0.0003999977008186351,
"loss": 4.4164,
"step": 120
},
{
"epoch": 0.0,
"learning_rate": 0.0003999973016561012,
"loss": 4.3347,
"step": 130
},
{
"epoch": 0.0,
"learning_rate": 0.00039999687056086294,
"loss": 4.2288,
"step": 140
},
{
"epoch": 0.0,
"learning_rate": 0.00039999640753298903,
"loss": 4.1609,
"step": 150
},
{
"epoch": 0.0,
"learning_rate": 0.0003999959125725535,
"loss": 4.0503,
"step": 160
},
{
"epoch": 0.0,
"learning_rate": 0.00039999538567963525,
"loss": 4.0077,
"step": 170
},
{
"epoch": 0.0,
"learning_rate": 0.0003999948268543185,
"loss": 3.9593,
"step": 180
},
{
"epoch": 0.0,
"learning_rate": 0.00039999423609669247,
"loss": 3.9097,
"step": 190
},
{
"epoch": 0.01,
"learning_rate": 0.0003999936134068514,
"loss": 3.8487,
"step": 200
},
{
"epoch": 0.01,
"learning_rate": 0.0003999929587848948,
"loss": 3.7907,
"step": 210
},
{
"epoch": 0.01,
"learning_rate": 0.00039999227223092726,
"loss": 3.7483,
"step": 220
},
{
"epoch": 0.01,
"learning_rate": 0.00039999155374505826,
"loss": 3.6961,
"step": 230
},
{
"epoch": 0.01,
"learning_rate": 0.0003999908033274025,
"loss": 3.5971,
"step": 240
},
{
"epoch": 0.01,
"learning_rate": 0.0003999900209780799,
"loss": 3.594,
"step": 250
},
{
"epoch": 0.01,
"learning_rate": 0.0003999892066972154,
"loss": 3.5695,
"step": 260
},
{
"epoch": 0.01,
"learning_rate": 0.0003999883604849389,
"loss": 3.4888,
"step": 270
},
{
"epoch": 0.01,
"learning_rate": 0.0003999874823413855,
"loss": 3.4582,
"step": 280
},
{
"epoch": 0.01,
"learning_rate": 0.0003999865722666956,
"loss": 3.4195,
"step": 290
},
{
"epoch": 0.01,
"learning_rate": 0.0003999856302610142,
"loss": 3.3806,
"step": 300
},
{
"epoch": 0.01,
"learning_rate": 0.000399984656324492,
"loss": 3.3137,
"step": 310
},
{
"epoch": 0.01,
"learning_rate": 0.0003999836504572844,
"loss": 3.2644,
"step": 320
},
{
"epoch": 0.01,
"learning_rate": 0.00039998261265955195,
"loss": 3.21,
"step": 330
},
{
"epoch": 0.01,
"learning_rate": 0.00039998154293146036,
"loss": 3.1669,
"step": 340
},
{
"epoch": 0.01,
"learning_rate": 0.0003999804412731805,
"loss": 3.129,
"step": 350
},
{
"epoch": 0.01,
"learning_rate": 0.00039997930768488827,
"loss": 3.0799,
"step": 360
},
{
"epoch": 0.01,
"learning_rate": 0.0003999781421667645,
"loss": 3.0365,
"step": 370
},
{
"epoch": 0.01,
"learning_rate": 0.0003999769447189955,
"loss": 3.0261,
"step": 380
},
{
"epoch": 0.01,
"learning_rate": 0.0003999757153417723,
"loss": 2.9473,
"step": 390
},
{
"epoch": 0.01,
"learning_rate": 0.00039997445403529134,
"loss": 2.9481,
"step": 400
},
{
"epoch": 0.01,
"learning_rate": 0.00039997316079975386,
"loss": 2.9903,
"step": 410
},
{
"epoch": 0.01,
"learning_rate": 0.0003999718356353664,
"loss": 2.9129,
"step": 420
},
{
"epoch": 0.01,
"learning_rate": 0.0003999704785423406,
"loss": 2.9139,
"step": 430
},
{
"epoch": 0.01,
"learning_rate": 0.00039996908952089305,
"loss": 2.8713,
"step": 440
},
{
"epoch": 0.01,
"learning_rate": 0.0003999676685712456,
"loss": 2.8558,
"step": 450
},
{
"epoch": 0.01,
"learning_rate": 0.00039996621569362504,
"loss": 2.8262,
"step": 460
},
{
"epoch": 0.01,
"learning_rate": 0.0003999647308882635,
"loss": 2.7535,
"step": 470
},
{
"epoch": 0.01,
"learning_rate": 0.0003999632141553979,
"loss": 2.7011,
"step": 480
},
{
"epoch": 0.01,
"learning_rate": 0.00039996166549527044,
"loss": 2.7204,
"step": 490
},
{
"epoch": 0.01,
"learning_rate": 0.0003999600849081285,
"loss": 2.6671,
"step": 500
},
{
"epoch": 0.01,
"learning_rate": 0.00039995847239422417,
"loss": 2.6731,
"step": 510
},
{
"epoch": 0.01,
"learning_rate": 0.0003999568279538153,
"loss": 2.673,
"step": 520
},
{
"epoch": 0.01,
"learning_rate": 0.00039995515158716417,
"loss": 2.6212,
"step": 530
},
{
"epoch": 0.01,
"learning_rate": 0.00039995344329453854,
"loss": 2.6267,
"step": 540
},
{
"epoch": 0.01,
"learning_rate": 0.00039995170307621114,
"loss": 2.5705,
"step": 550
},
{
"epoch": 0.01,
"learning_rate": 0.0003999499309324598,
"loss": 2.539,
"step": 560
},
{
"epoch": 0.01,
"learning_rate": 0.0003999481268635675,
"loss": 2.6173,
"step": 570
},
{
"epoch": 0.01,
"learning_rate": 0.0003999462908698223,
"loss": 2.5528,
"step": 580
},
{
"epoch": 0.02,
"learning_rate": 0.00039994442295151735,
"loss": 2.5721,
"step": 590
},
{
"epoch": 0.02,
"learning_rate": 0.00039994252310895093,
"loss": 2.5522,
"step": 600
},
{
"epoch": 0.02,
"learning_rate": 0.0003999405913424262,
"loss": 2.5407,
"step": 610
},
{
"epoch": 0.02,
"learning_rate": 0.0003999386276522518,
"loss": 2.4878,
"step": 620
},
{
"epoch": 0.02,
"learning_rate": 0.00039993663203874116,
"loss": 2.4971,
"step": 630
},
{
"epoch": 0.02,
"learning_rate": 0.00039993460450221294,
"loss": 2.5823,
"step": 640
},
{
"epoch": 0.02,
"learning_rate": 0.00039993254504299086,
"loss": 2.4845,
"step": 650
},
{
"epoch": 0.02,
"learning_rate": 0.00039993045366140375,
"loss": 2.4013,
"step": 660
},
{
"epoch": 0.02,
"learning_rate": 0.0003999283303577856,
"loss": 2.4794,
"step": 670
},
{
"epoch": 0.02,
"learning_rate": 0.00039992617513247525,
"loss": 2.3949,
"step": 680
},
{
"epoch": 0.02,
"learning_rate": 0.00039992398798581696,
"loss": 2.4262,
"step": 690
},
{
"epoch": 0.02,
"learning_rate": 0.00039992176891815996,
"loss": 2.3461,
"step": 700
},
{
"epoch": 0.02,
"learning_rate": 0.00039991951792985844,
"loss": 2.3657,
"step": 710
},
{
"epoch": 0.02,
"learning_rate": 0.00039991723502127193,
"loss": 2.3655,
"step": 720
},
{
"epoch": 0.02,
"learning_rate": 0.00039991492019276483,
"loss": 2.407,
"step": 730
},
{
"epoch": 0.02,
"learning_rate": 0.0003999125734447068,
"loss": 2.3978,
"step": 740
},
{
"epoch": 0.02,
"learning_rate": 0.00039991019477747254,
"loss": 2.3706,
"step": 750
},
{
"epoch": 0.02,
"learning_rate": 0.0003999077841914418,
"loss": 2.3245,
"step": 760
},
{
"epoch": 0.02,
"learning_rate": 0.0003999053416869995,
"loss": 2.3302,
"step": 770
},
{
"epoch": 0.02,
"learning_rate": 0.00039990286726453557,
"loss": 2.3197,
"step": 780
},
{
"epoch": 0.02,
"learning_rate": 0.0003999003609244452,
"loss": 2.3013,
"step": 790
},
{
"epoch": 0.02,
"learning_rate": 0.0003998978226671284,
"loss": 2.3447,
"step": 800
},
{
"epoch": 0.02,
"learning_rate": 0.0003998952524929906,
"loss": 2.3617,
"step": 810
},
{
"epoch": 0.02,
"learning_rate": 0.0003998926504024421,
"loss": 2.3264,
"step": 820
},
{
"epoch": 0.02,
"learning_rate": 0.0003998900163958984,
"loss": 2.2853,
"step": 830
},
{
"epoch": 0.02,
"learning_rate": 0.00039988735047377996,
"loss": 2.3353,
"step": 840
},
{
"epoch": 0.02,
"learning_rate": 0.00039988465263651263,
"loss": 2.3734,
"step": 850
},
{
"epoch": 0.02,
"learning_rate": 0.0003998819228845269,
"loss": 2.2988,
"step": 860
},
{
"epoch": 0.02,
"learning_rate": 0.0003998791612182589,
"loss": 2.2447,
"step": 870
},
{
"epoch": 0.02,
"learning_rate": 0.0003998763676381493,
"loss": 2.2734,
"step": 880
},
{
"epoch": 0.02,
"learning_rate": 0.0003998735421446444,
"loss": 2.3543,
"step": 890
},
{
"epoch": 0.02,
"learning_rate": 0.00039987068473819516,
"loss": 2.3313,
"step": 900
},
{
"epoch": 0.02,
"learning_rate": 0.00039986779541925784,
"loss": 2.2926,
"step": 910
},
{
"epoch": 0.02,
"learning_rate": 0.0003998648741882938,
"loss": 2.2219,
"step": 920
},
{
"epoch": 0.02,
"learning_rate": 0.0003998619210457695,
"loss": 2.1825,
"step": 930
},
{
"epoch": 0.02,
"learning_rate": 0.0003998589359921563,
"loss": 2.2279,
"step": 940
},
{
"epoch": 0.02,
"learning_rate": 0.0003998559190279309,
"loss": 2.18,
"step": 950
},
{
"epoch": 0.02,
"learning_rate": 0.0003998528701535751,
"loss": 2.2806,
"step": 960
},
{
"epoch": 0.02,
"learning_rate": 0.00039984978936957554,
"loss": 2.2602,
"step": 970
},
{
"epoch": 0.02,
"learning_rate": 0.00039984667667642424,
"loss": 2.2162,
"step": 980
},
{
"epoch": 0.03,
"learning_rate": 0.0003998435320746181,
"loss": 2.2006,
"step": 990
},
{
"epoch": 0.03,
"learning_rate": 0.00039984035556465925,
"loss": 2.2247,
"step": 1000
},
{
"epoch": 0.03,
"learning_rate": 0.00039983714714705485,
"loss": 2.2148,
"step": 1010
},
{
"epoch": 0.03,
"learning_rate": 0.00039983390682231726,
"loss": 2.1869,
"step": 1020
},
{
"epoch": 0.03,
"learning_rate": 0.00039983063459096376,
"loss": 2.1773,
"step": 1030
},
{
"epoch": 0.03,
"learning_rate": 0.00039982733045351677,
"loss": 2.1144,
"step": 1040
},
{
"epoch": 0.03,
"learning_rate": 0.00039982399441050397,
"loss": 2.1769,
"step": 1050
},
{
"epoch": 0.03,
"learning_rate": 0.0003998206264624579,
"loss": 2.1617,
"step": 1060
},
{
"epoch": 0.03,
"learning_rate": 0.00039981722660991634,
"loss": 2.1872,
"step": 1070
},
{
"epoch": 0.03,
"learning_rate": 0.00039981379485342223,
"loss": 2.1406,
"step": 1080
},
{
"epoch": 0.03,
"learning_rate": 0.00039981033119352335,
"loss": 2.1804,
"step": 1090
},
{
"epoch": 0.03,
"learning_rate": 0.00039980683563077286,
"loss": 2.1605,
"step": 1100
},
{
"epoch": 0.03,
"learning_rate": 0.0003998033081657288,
"loss": 2.159,
"step": 1110
},
{
"epoch": 0.03,
"learning_rate": 0.0003997997487989543,
"loss": 2.1817,
"step": 1120
},
{
"epoch": 0.03,
"learning_rate": 0.0003997961575310179,
"loss": 2.1418,
"step": 1130
},
{
"epoch": 0.03,
"learning_rate": 0.00039979253436249284,
"loss": 2.166,
"step": 1140
},
{
"epoch": 0.03,
"learning_rate": 0.0003997888792939576,
"loss": 2.1097,
"step": 1150
},
{
"epoch": 0.03,
"learning_rate": 0.00039978519232599584,
"loss": 2.1546,
"step": 1160
},
{
"epoch": 0.03,
"learning_rate": 0.00039978147345919626,
"loss": 2.1352,
"step": 1170
},
{
"epoch": 0.03,
"learning_rate": 0.00039977772269415255,
"loss": 2.1723,
"step": 1180
},
{
"epoch": 0.03,
"learning_rate": 0.00039977394003146366,
"loss": 2.1614,
"step": 1190
},
{
"epoch": 0.03,
"learning_rate": 0.00039977012547173346,
"loss": 2.0975,
"step": 1200
},
{
"epoch": 0.03,
"learning_rate": 0.00039976627901557114,
"loss": 2.1734,
"step": 1210
},
{
"epoch": 0.03,
"learning_rate": 0.0003997624006635907,
"loss": 2.1047,
"step": 1220
},
{
"epoch": 0.03,
"learning_rate": 0.00039975849041641153,
"loss": 2.1343,
"step": 1230
},
{
"epoch": 0.03,
"learning_rate": 0.00039975454827465777,
"loss": 2.0768,
"step": 1240
},
{
"epoch": 0.03,
"learning_rate": 0.00039975057423895905,
"loss": 2.1033,
"step": 1250
},
{
"epoch": 0.03,
"learning_rate": 0.0003997465683099497,
"loss": 2.1109,
"step": 1260
},
{
"epoch": 0.03,
"learning_rate": 0.00039974253048826944,
"loss": 2.1695,
"step": 1270
},
{
"epoch": 0.03,
"learning_rate": 0.00039973846077456305,
"loss": 2.1097,
"step": 1280
},
{
"epoch": 0.03,
"learning_rate": 0.00039973435916948013,
"loss": 2.0771,
"step": 1290
},
{
"epoch": 0.03,
"learning_rate": 0.0003997302256736757,
"loss": 2.1272,
"step": 1300
},
{
"epoch": 0.03,
"learning_rate": 0.00039972606028780967,
"loss": 2.1463,
"step": 1310
},
{
"epoch": 0.03,
"learning_rate": 0.00039972186301254713,
"loss": 2.0335,
"step": 1320
},
{
"epoch": 0.03,
"learning_rate": 0.00039971763384855823,
"loss": 2.0344,
"step": 1330
},
{
"epoch": 0.03,
"learning_rate": 0.0003997133727965183,
"loss": 2.145,
"step": 1340
},
{
"epoch": 0.03,
"learning_rate": 0.0003997090798571076,
"loss": 2.1313,
"step": 1350
},
{
"epoch": 0.03,
"learning_rate": 0.00039970475503101164,
"loss": 2.1041,
"step": 1360
},
{
"epoch": 0.03,
"learning_rate": 0.00039970039831892083,
"loss": 2.0418,
"step": 1370
},
{
"epoch": 0.04,
"learning_rate": 0.0003996960097215309,
"loss": 2.0691,
"step": 1380
},
{
"epoch": 0.04,
"learning_rate": 0.0003996915892395425,
"loss": 2.0488,
"step": 1390
},
{
"epoch": 0.04,
"learning_rate": 0.00039968713687366135,
"loss": 2.104,
"step": 1400
},
{
"epoch": 0.04,
"learning_rate": 0.0003996826526245985,
"loss": 2.107,
"step": 1410
},
{
"epoch": 0.04,
"learning_rate": 0.00039967813649306993,
"loss": 2.023,
"step": 1420
},
{
"epoch": 0.04,
"learning_rate": 0.00039967358847979654,
"loss": 2.1037,
"step": 1430
},
{
"epoch": 0.04,
"learning_rate": 0.00039966900858550466,
"loss": 2.1022,
"step": 1440
},
{
"epoch": 0.04,
"learning_rate": 0.0003996643968109254,
"loss": 2.1065,
"step": 1450
},
{
"epoch": 0.04,
"learning_rate": 0.0003996597531567953,
"loss": 2.0959,
"step": 1460
},
{
"epoch": 0.04,
"learning_rate": 0.00039965507762385554,
"loss": 2.0508,
"step": 1470
},
{
"epoch": 0.04,
"learning_rate": 0.00039965037021285287,
"loss": 2.0902,
"step": 1480
},
{
"epoch": 0.04,
"learning_rate": 0.00039964563092453876,
"loss": 2.0,
"step": 1490
},
{
"epoch": 0.04,
"learning_rate": 0.0003996408597596699,
"loss": 2.0425,
"step": 1500
},
{
"epoch": 0.04,
"learning_rate": 0.0003996360567190082,
"loss": 2.0393,
"step": 1510
},
{
"epoch": 0.04,
"learning_rate": 0.0003996312218033205,
"loss": 2.0449,
"step": 1520
},
{
"epoch": 0.04,
"learning_rate": 0.00039962635501337876,
"loss": 2.1379,
"step": 1530
},
{
"epoch": 0.04,
"learning_rate": 0.00039962145634995995,
"loss": 2.1077,
"step": 1540
},
{
"epoch": 0.04,
"learning_rate": 0.00039961652581384634,
"loss": 2.0143,
"step": 1550
},
{
"epoch": 0.04,
"learning_rate": 0.00039961156340582516,
"loss": 2.0115,
"step": 1560
},
{
"epoch": 0.04,
"learning_rate": 0.00039960656912668864,
"loss": 1.9883,
"step": 1570
},
{
"epoch": 0.04,
"learning_rate": 0.0003996015429772343,
"loss": 2.0018,
"step": 1580
},
{
"epoch": 0.04,
"learning_rate": 0.0003995964849582646,
"loss": 1.9963,
"step": 1590
},
{
"epoch": 0.04,
"learning_rate": 0.00039959139507058706,
"loss": 1.9747,
"step": 1600
},
{
"epoch": 0.04,
"learning_rate": 0.0003995862733150145,
"loss": 2.0209,
"step": 1610
},
{
"epoch": 0.04,
"learning_rate": 0.00039958111969236456,
"loss": 2.05,
"step": 1620
},
{
"epoch": 0.04,
"learning_rate": 0.00039957593420346024,
"loss": 2.0429,
"step": 1630
},
{
"epoch": 0.04,
"learning_rate": 0.0003995707168491293,
"loss": 2.0028,
"step": 1640
},
{
"epoch": 0.04,
"learning_rate": 0.0003995654676302049,
"loss": 1.9768,
"step": 1650
},
{
"epoch": 0.04,
"learning_rate": 0.0003995601865475252,
"loss": 1.9517,
"step": 1660
},
{
"epoch": 0.04,
"learning_rate": 0.0003995548736019333,
"loss": 1.9689,
"step": 1670
},
{
"epoch": 0.04,
"learning_rate": 0.00039954952879427754,
"loss": 2.0619,
"step": 1680
},
{
"epoch": 0.04,
"learning_rate": 0.0003995441521254113,
"loss": 2.0244,
"step": 1690
},
{
"epoch": 0.04,
"learning_rate": 0.000399538743596193,
"loss": 2.0409,
"step": 1700
},
{
"epoch": 0.04,
"learning_rate": 0.00039953330320748623,
"loss": 2.1296,
"step": 1710
},
{
"epoch": 0.04,
"learning_rate": 0.00039952783096015973,
"loss": 2.0191,
"step": 1720
},
{
"epoch": 0.04,
"learning_rate": 0.00039952232685508713,
"loss": 2.036,
"step": 1730
},
{
"epoch": 0.04,
"learning_rate": 0.00039951679089314724,
"loss": 2.0275,
"step": 1740
},
{
"epoch": 0.04,
"learning_rate": 0.000399511223075224,
"loss": 2.0407,
"step": 1750
},
{
"epoch": 0.04,
"learning_rate": 0.0003995056234022064,
"loss": 1.9921,
"step": 1760
},
{
"epoch": 0.05,
"learning_rate": 0.00039949999187498844,
"loss": 1.9966,
"step": 1770
},
{
"epoch": 0.05,
"learning_rate": 0.0003994943284944694,
"loss": 1.9928,
"step": 1780
},
{
"epoch": 0.05,
"learning_rate": 0.0003994886332615535,
"loss": 1.9647,
"step": 1790
},
{
"epoch": 0.05,
"learning_rate": 0.0003994829061771499,
"loss": 1.9395,
"step": 1800
},
{
"epoch": 0.05,
"learning_rate": 0.0003994771472421733,
"loss": 1.9352,
"step": 1810
},
{
"epoch": 0.05,
"learning_rate": 0.00039947135645754305,
"loss": 1.9727,
"step": 1820
},
{
"epoch": 0.05,
"learning_rate": 0.00039946553382418375,
"loss": 2.0184,
"step": 1830
},
{
"epoch": 0.05,
"learning_rate": 0.00039945967934302504,
"loss": 1.9454,
"step": 1840
},
{
"epoch": 0.05,
"learning_rate": 0.0003994537930150018,
"loss": 1.9974,
"step": 1850
},
{
"epoch": 0.05,
"learning_rate": 0.00039944787484105375,
"loss": 1.9923,
"step": 1860
},
{
"epoch": 0.05,
"learning_rate": 0.00039944192482212584,
"loss": 1.9374,
"step": 1870
},
{
"epoch": 0.05,
"learning_rate": 0.0003994359429591681,
"loss": 2.035,
"step": 1880
},
{
"epoch": 0.05,
"learning_rate": 0.0003994299292531357,
"loss": 1.9926,
"step": 1890
},
{
"epoch": 0.05,
"learning_rate": 0.00039942388370498873,
"loss": 1.9723,
"step": 1900
},
{
"epoch": 0.05,
"learning_rate": 0.0003994178063156925,
"loss": 1.9967,
"step": 1910
},
{
"epoch": 0.05,
"learning_rate": 0.0003994116970862173,
"loss": 2.0542,
"step": 1920
},
{
"epoch": 0.05,
"learning_rate": 0.0003994055560175387,
"loss": 2.0183,
"step": 1930
},
{
"epoch": 0.05,
"learning_rate": 0.000399399383110637,
"loss": 2.0496,
"step": 1940
},
{
"epoch": 0.05,
"learning_rate": 0.000399393178366498,
"loss": 1.9737,
"step": 1950
},
{
"epoch": 0.05,
"learning_rate": 0.0003993869417861123,
"loss": 1.9379,
"step": 1960
},
{
"epoch": 0.05,
"learning_rate": 0.0003993806733704757,
"loss": 2.0235,
"step": 1970
},
{
"epoch": 0.05,
"learning_rate": 0.00039937437312058903,
"loss": 2.0181,
"step": 1980
},
{
"epoch": 0.05,
"learning_rate": 0.00039936804103745825,
"loss": 2.0143,
"step": 1990
},
{
"epoch": 0.05,
"learning_rate": 0.0003993616771220944,
"loss": 1.9753,
"step": 2000
},
{
"epoch": 0.05,
"learning_rate": 0.0003993552813755134,
"loss": 1.9484,
"step": 2010
},
{
"epoch": 0.05,
"learning_rate": 0.0003993488537987366,
"loss": 1.964,
"step": 2020
},
{
"epoch": 0.05,
"learning_rate": 0.00039934239439279025,
"loss": 1.9522,
"step": 2030
},
{
"epoch": 0.05,
"learning_rate": 0.00039933590315870566,
"loss": 2.0053,
"step": 2040
},
{
"epoch": 0.05,
"learning_rate": 0.0003993293800975193,
"loss": 1.9453,
"step": 2050
},
{
"epoch": 0.05,
"learning_rate": 0.00039932282521027264,
"loss": 1.8729,
"step": 2060
},
{
"epoch": 0.05,
"learning_rate": 0.00039931623849801223,
"loss": 2.0346,
"step": 2070
},
{
"epoch": 0.05,
"learning_rate": 0.0003993096199617899,
"loss": 1.9505,
"step": 2080
},
{
"epoch": 0.05,
"learning_rate": 0.00039930296960266224,
"loss": 1.9834,
"step": 2090
},
{
"epoch": 0.05,
"learning_rate": 0.0003992962874216911,
"loss": 1.9864,
"step": 2100
},
{
"epoch": 0.05,
"learning_rate": 0.0003992895734199434,
"loss": 1.9417,
"step": 2110
},
{
"epoch": 0.05,
"learning_rate": 0.00039928282759849125,
"loss": 1.9021,
"step": 2120
},
{
"epoch": 0.05,
"learning_rate": 0.0003992760499584116,
"loss": 1.9185,
"step": 2130
},
{
"epoch": 0.05,
"learning_rate": 0.0003992692405007867,
"loss": 1.9042,
"step": 2140
},
{
"epoch": 0.05,
"learning_rate": 0.00039926239922670367,
"loss": 1.9236,
"step": 2150
},
{
"epoch": 0.05,
"learning_rate": 0.00039925552613725496,
"loss": 2.0219,
"step": 2160
},
{
"epoch": 0.06,
"learning_rate": 0.00039924862123353786,
"loss": 1.9175,
"step": 2170
},
{
"epoch": 0.06,
"learning_rate": 0.0003992416845166549,
"loss": 1.9339,
"step": 2180
},
{
"epoch": 0.06,
"learning_rate": 0.0003992347159877136,
"loss": 1.8682,
"step": 2190
},
{
"epoch": 0.06,
"learning_rate": 0.00039922771564782663,
"loss": 1.9337,
"step": 2200
},
{
"epoch": 0.06,
"learning_rate": 0.0003992206834981118,
"loss": 1.9015,
"step": 2210
},
{
"epoch": 0.06,
"learning_rate": 0.0003992136195396916,
"loss": 1.9687,
"step": 2220
},
{
"epoch": 0.06,
"learning_rate": 0.00039920652377369427,
"loss": 1.9354,
"step": 2230
},
{
"epoch": 0.06,
"learning_rate": 0.0003991993962012525,
"loss": 1.9228,
"step": 2240
},
{
"epoch": 0.06,
"learning_rate": 0.00039919223682350437,
"loss": 1.9972,
"step": 2250
},
{
"epoch": 0.06,
"learning_rate": 0.000399185045641593,
"loss": 1.9626,
"step": 2260
},
{
"epoch": 0.06,
"learning_rate": 0.0003991778226566667,
"loss": 1.9724,
"step": 2270
},
{
"epoch": 0.06,
"learning_rate": 0.00039917056786987863,
"loss": 1.8766,
"step": 2280
},
{
"epoch": 0.06,
"learning_rate": 0.00039916328128238704,
"loss": 1.8419,
"step": 2290
},
{
"epoch": 0.06,
"learning_rate": 0.0003991559628953555,
"loss": 1.855,
"step": 2300
},
{
"epoch": 0.06,
"learning_rate": 0.0003991486127099525,
"loss": 1.966,
"step": 2310
},
{
"epoch": 0.06,
"learning_rate": 0.0003991412307273515,
"loss": 1.9684,
"step": 2320
},
{
"epoch": 0.06,
"learning_rate": 0.00039913381694873113,
"loss": 1.9364,
"step": 2330
},
{
"epoch": 0.06,
"learning_rate": 0.0003991263713752753,
"loss": 1.9438,
"step": 2340
},
{
"epoch": 0.06,
"learning_rate": 0.0003991188940081727,
"loss": 1.9081,
"step": 2350
},
{
"epoch": 0.06,
"learning_rate": 0.00039911138484861707,
"loss": 1.8837,
"step": 2360
},
{
"epoch": 0.06,
"learning_rate": 0.00039910384389780764,
"loss": 1.8935,
"step": 2370
},
{
"epoch": 0.06,
"learning_rate": 0.0003990962711569483,
"loss": 1.8873,
"step": 2380
},
{
"epoch": 0.06,
"learning_rate": 0.0003990886666272481,
"loss": 1.8894,
"step": 2390
},
{
"epoch": 0.06,
"learning_rate": 0.0003990810303099213,
"loss": 1.9223,
"step": 2400
},
{
"epoch": 0.06,
"learning_rate": 0.0003990733622061872,
"loss": 1.8759,
"step": 2410
},
{
"epoch": 0.06,
"learning_rate": 0.0003990656623172701,
"loss": 1.9005,
"step": 2420
},
{
"epoch": 0.06,
"learning_rate": 0.0003990579306443993,
"loss": 1.9132,
"step": 2430
},
{
"epoch": 0.06,
"learning_rate": 0.00039905016718880937,
"loss": 1.8808,
"step": 2440
},
{
"epoch": 0.06,
"learning_rate": 0.0003990423719517399,
"loss": 1.891,
"step": 2450
},
{
"epoch": 0.06,
"learning_rate": 0.0003990345449344355,
"loss": 1.8811,
"step": 2460
},
{
"epoch": 0.06,
"learning_rate": 0.0003990266861381459,
"loss": 1.8724,
"step": 2470
},
{
"epoch": 0.06,
"learning_rate": 0.00039901879556412576,
"loss": 1.9121,
"step": 2480
},
{
"epoch": 0.06,
"learning_rate": 0.00039901087321363507,
"loss": 1.9112,
"step": 2490
},
{
"epoch": 0.06,
"learning_rate": 0.0003990029190879387,
"loss": 1.931,
"step": 2500
},
{
"epoch": 0.06,
"learning_rate": 0.00039899493318830664,
"loss": 1.8886,
"step": 2510
},
{
"epoch": 0.06,
"learning_rate": 0.00039898691551601396,
"loss": 1.953,
"step": 2520
},
{
"epoch": 0.06,
"learning_rate": 0.0003989788660723408,
"loss": 1.9476,
"step": 2530
},
{
"epoch": 0.06,
"learning_rate": 0.00039897078485857257,
"loss": 1.916,
"step": 2540
},
{
"epoch": 0.06,
"learning_rate": 0.0003989626718759993,
"loss": 1.9513,
"step": 2550
},
{
"epoch": 0.07,
"learning_rate": 0.00039895452712591646,
"loss": 1.9378,
"step": 2560
},
{
"epoch": 0.07,
"learning_rate": 0.0003989463506096244,
"loss": 1.9231,
"step": 2570
},
{
"epoch": 0.07,
"learning_rate": 0.00039893814232842877,
"loss": 1.8928,
"step": 2580
},
{
"epoch": 0.07,
"learning_rate": 0.0003989299022836401,
"loss": 1.8642,
"step": 2590
},
{
"epoch": 0.07,
"learning_rate": 0.00039892163047657405,
"loss": 1.8858,
"step": 2600
},
{
"epoch": 0.07,
"learning_rate": 0.0003989133269085513,
"loss": 1.9193,
"step": 2610
},
{
"epoch": 0.07,
"learning_rate": 0.0003989049915808977,
"loss": 1.8847,
"step": 2620
},
{
"epoch": 0.07,
"learning_rate": 0.0003988966244949441,
"loss": 1.9073,
"step": 2630
},
{
"epoch": 0.07,
"learning_rate": 0.0003988882256520264,
"loss": 1.9436,
"step": 2640
},
{
"epoch": 0.07,
"learning_rate": 0.00039887979505348565,
"loss": 1.9591,
"step": 2650
},
{
"epoch": 0.07,
"learning_rate": 0.0003988713327006679,
"loss": 1.8816,
"step": 2660
},
{
"epoch": 0.07,
"learning_rate": 0.0003988628385949242,
"loss": 1.9548,
"step": 2670
},
{
"epoch": 0.07,
"learning_rate": 0.00039885431273761095,
"loss": 1.9138,
"step": 2680
},
{
"epoch": 0.07,
"learning_rate": 0.0003988457551300894,
"loss": 2.158,
"step": 2690
},
{
"epoch": 0.07,
"learning_rate": 0.00039883716577372583,
"loss": 2.0845,
"step": 2700
},
{
"epoch": 0.07,
"learning_rate": 0.00039882854466989176,
"loss": 2.0201,
"step": 2710
},
{
"epoch": 0.07,
"learning_rate": 0.00039881989181996354,
"loss": 1.9172,
"step": 2720
},
{
"epoch": 0.07,
"learning_rate": 0.00039881120722532284,
"loss": 1.9177,
"step": 2730
},
{
"epoch": 0.07,
"learning_rate": 0.0003988024908873563,
"loss": 1.9243,
"step": 2740
},
{
"epoch": 0.07,
"learning_rate": 0.0003987937428074556,
"loss": 1.9129,
"step": 2750
},
{
"epoch": 0.07,
"learning_rate": 0.00039878496298701745,
"loss": 1.8875,
"step": 2760
},
{
"epoch": 0.07,
"learning_rate": 0.00039877615142744376,
"loss": 1.8674,
"step": 2770
},
{
"epoch": 0.07,
"learning_rate": 0.0003987673081301414,
"loss": 1.913,
"step": 2780
},
{
"epoch": 0.07,
"learning_rate": 0.00039875843309652233,
"loss": 1.9136,
"step": 2790
},
{
"epoch": 0.07,
"learning_rate": 0.0003987495263280037,
"loss": 1.8634,
"step": 2800
},
{
"epoch": 0.07,
"learning_rate": 0.00039874058782600745,
"loss": 1.8743,
"step": 2810
},
{
"epoch": 0.07,
"learning_rate": 0.00039873161759196085,
"loss": 1.7969,
"step": 2820
},
{
"epoch": 0.07,
"learning_rate": 0.00039872261562729606,
"loss": 1.9213,
"step": 2830
},
{
"epoch": 0.07,
"learning_rate": 0.00039871358193345046,
"loss": 1.8824,
"step": 2840
},
{
"epoch": 0.07,
"learning_rate": 0.0003987045165118664,
"loss": 1.9052,
"step": 2850
},
{
"epoch": 0.07,
"learning_rate": 0.00039869541936399136,
"loss": 1.8925,
"step": 2860
},
{
"epoch": 0.07,
"learning_rate": 0.0003986862904912777,
"loss": 1.8846,
"step": 2870
},
{
"epoch": 0.07,
"learning_rate": 0.0003986771298951832,
"loss": 1.8653,
"step": 2880
},
{
"epoch": 0.07,
"learning_rate": 0.0003986679375771703,
"loss": 1.8782,
"step": 2890
},
{
"epoch": 0.07,
"learning_rate": 0.00039865871353870683,
"loss": 1.959,
"step": 2900
},
{
"epoch": 0.07,
"learning_rate": 0.0003986494577812655,
"loss": 1.8541,
"step": 2910
},
{
"epoch": 0.07,
"learning_rate": 0.0003986401703063241,
"loss": 1.8953,
"step": 2920
},
{
"epoch": 0.07,
"learning_rate": 0.0003986308511153656,
"loss": 1.9349,
"step": 2930
},
{
"epoch": 0.07,
"learning_rate": 0.0003986215002098778,
"loss": 1.8454,
"step": 2940
},
{
"epoch": 0.08,
"learning_rate": 0.00039861211759135393,
"loss": 1.8725,
"step": 2950
},
{
"epoch": 0.08,
"learning_rate": 0.000398602703261292,
"loss": 1.9361,
"step": 2960
},
{
"epoch": 0.08,
"learning_rate": 0.000398593257221195,
"loss": 1.886,
"step": 2970
},
{
"epoch": 0.08,
"learning_rate": 0.00039858377947257133,
"loss": 1.9202,
"step": 2980
},
{
"epoch": 0.08,
"learning_rate": 0.0003985742700169342,
"loss": 1.9123,
"step": 2990
},
{
"epoch": 0.08,
"learning_rate": 0.00039856472885580195,
"loss": 1.9366,
"step": 3000
},
{
"epoch": 0.08,
"learning_rate": 0.000398555155990698,
"loss": 1.8829,
"step": 3010
},
{
"epoch": 0.08,
"learning_rate": 0.00039854555142315067,
"loss": 1.9275,
"step": 3020
},
{
"epoch": 0.08,
"learning_rate": 0.00039853591515469366,
"loss": 1.9413,
"step": 3030
},
{
"epoch": 0.08,
"learning_rate": 0.00039852624718686544,
"loss": 1.8638,
"step": 3040
},
{
"epoch": 0.08,
"learning_rate": 0.00039851654752120976,
"loss": 1.9021,
"step": 3050
},
{
"epoch": 0.08,
"learning_rate": 0.00039850681615927517,
"loss": 1.9018,
"step": 3060
},
{
"epoch": 0.08,
"learning_rate": 0.00039849705310261556,
"loss": 1.8719,
"step": 3070
},
{
"epoch": 0.08,
"learning_rate": 0.00039848725835278973,
"loss": 1.8713,
"step": 3080
},
{
"epoch": 0.08,
"learning_rate": 0.00039847743191136144,
"loss": 1.8903,
"step": 3090
},
{
"epoch": 0.08,
"learning_rate": 0.00039846757377989985,
"loss": 1.8186,
"step": 3100
},
{
"epoch": 0.08,
"learning_rate": 0.00039845768395997883,
"loss": 1.8552,
"step": 3110
},
{
"epoch": 0.08,
"learning_rate": 0.00039844776245317743,
"loss": 1.8805,
"step": 3120
},
{
"epoch": 0.08,
"learning_rate": 0.0003984378092610799,
"loss": 1.9612,
"step": 3130
},
{
"epoch": 0.08,
"learning_rate": 0.00039842782438527524,
"loss": 1.8515,
"step": 3140
},
{
"epoch": 0.08,
"learning_rate": 0.0003984178078273579,
"loss": 1.8468,
"step": 3150
},
{
"epoch": 0.08,
"learning_rate": 0.00039840775958892695,
"loss": 1.8468,
"step": 3160
},
{
"epoch": 0.08,
"learning_rate": 0.00039839767967158696,
"loss": 1.8479,
"step": 3170
},
{
"epoch": 0.08,
"learning_rate": 0.00039838756807694724,
"loss": 1.8419,
"step": 3180
},
{
"epoch": 0.08,
"learning_rate": 0.0003983774248066222,
"loss": 1.8238,
"step": 3190
},
{
"epoch": 0.08,
"learning_rate": 0.0003983672498622316,
"loss": 1.8248,
"step": 3200
},
{
"epoch": 0.08,
"learning_rate": 0.0003983570432453998,
"loss": 1.8572,
"step": 3210
},
{
"epoch": 0.08,
"learning_rate": 0.00039834680495775654,
"loss": 1.9093,
"step": 3220
},
{
"epoch": 0.08,
"learning_rate": 0.00039833653500093646,
"loss": 1.898,
"step": 3230
},
{
"epoch": 0.08,
"learning_rate": 0.00039832623337657946,
"loss": 1.847,
"step": 3240
},
{
"epoch": 0.08,
"learning_rate": 0.00039831590008633023,
"loss": 1.8168,
"step": 3250
},
{
"epoch": 0.08,
"learning_rate": 0.0003983055351318387,
"loss": 1.8661,
"step": 3260
},
{
"epoch": 0.08,
"learning_rate": 0.00039829513851475973,
"loss": 1.882,
"step": 3270
},
{
"epoch": 0.08,
"learning_rate": 0.00039828471023675336,
"loss": 1.8127,
"step": 3280
},
{
"epoch": 0.08,
"learning_rate": 0.0003982742502994847,
"loss": 1.8136,
"step": 3290
},
{
"epoch": 0.08,
"learning_rate": 0.00039826375870462364,
"loss": 1.8412,
"step": 3300
},
{
"epoch": 0.08,
"learning_rate": 0.00039825323545384555,
"loss": 1.8997,
"step": 3310
},
{
"epoch": 0.08,
"learning_rate": 0.00039824268054883044,
"loss": 1.8213,
"step": 3320
},
{
"epoch": 0.08,
"learning_rate": 0.0003982320939912637,
"loss": 1.8443,
"step": 3330
},
{
"epoch": 0.08,
"learning_rate": 0.00039822147578283564,
"loss": 1.8576,
"step": 3340
},
{
"epoch": 0.09,
"learning_rate": 0.0003982108259252415,
"loss": 1.8235,
"step": 3350
},
{
"epoch": 0.09,
"learning_rate": 0.0003982001444201818,
"loss": 1.8112,
"step": 3360
},
{
"epoch": 0.09,
"learning_rate": 0.00039818943126936204,
"loss": 1.8501,
"step": 3370
},
{
"epoch": 0.09,
"learning_rate": 0.0003981786864744926,
"loss": 1.9042,
"step": 3380
},
{
"epoch": 0.09,
"learning_rate": 0.00039816791003728923,
"loss": 1.8704,
"step": 3390
},
{
"epoch": 0.09,
"learning_rate": 0.00039815710195947244,
"loss": 1.8226,
"step": 3400
},
{
"epoch": 0.09,
"learning_rate": 0.00039814626224276797,
"loss": 1.8682,
"step": 3410
},
{
"epoch": 0.09,
"learning_rate": 0.00039813539088890646,
"loss": 1.822,
"step": 3420
},
{
"epoch": 0.09,
"learning_rate": 0.0003981244878996238,
"loss": 1.8546,
"step": 3430
},
{
"epoch": 0.09,
"learning_rate": 0.0003981135532766607,
"loss": 1.8375,
"step": 3440
},
{
"epoch": 0.09,
"learning_rate": 0.00039810258702176323,
"loss": 1.9318,
"step": 3450
},
{
"epoch": 0.09,
"learning_rate": 0.00039809158913668214,
"loss": 1.8846,
"step": 3460
},
{
"epoch": 0.09,
"learning_rate": 0.0003980805596231735,
"loss": 1.841,
"step": 3470
},
{
"epoch": 0.09,
"learning_rate": 0.0003980694984829984,
"loss": 1.7964,
"step": 3480
},
{
"epoch": 0.09,
"learning_rate": 0.00039805840571792276,
"loss": 1.8562,
"step": 3490
},
{
"epoch": 0.09,
"learning_rate": 0.0003980472813297179,
"loss": 1.8546,
"step": 3500
},
{
"epoch": 0.09,
"learning_rate": 0.00039803612532015984,
"loss": 1.8304,
"step": 3510
},
{
"epoch": 0.09,
"learning_rate": 0.00039802493769102994,
"loss": 1.8223,
"step": 3520
},
{
"epoch": 0.09,
"learning_rate": 0.0003980137184441143,
"loss": 1.8645,
"step": 3530
},
{
"epoch": 0.09,
"learning_rate": 0.0003980024675812046,
"loss": 1.8605,
"step": 3540
},
{
"epoch": 0.09,
"learning_rate": 0.0003979911851040968,
"loss": 1.8072,
"step": 3550
},
{
"epoch": 0.09,
"learning_rate": 0.0003979798710145926,
"loss": 1.8886,
"step": 3560
},
{
"epoch": 0.09,
"learning_rate": 0.0003979685253144984,
"loss": 1.7982,
"step": 3570
},
{
"epoch": 0.09,
"learning_rate": 0.0003979571480056257,
"loss": 1.8072,
"step": 3580
},
{
"epoch": 0.09,
"learning_rate": 0.0003979457390897911,
"loss": 1.8435,
"step": 3590
},
{
"epoch": 0.09,
"learning_rate": 0.0003979342985688161,
"loss": 1.832,
"step": 3600
},
{
"epoch": 0.09,
"learning_rate": 0.0003979228264445275,
"loss": 1.8205,
"step": 3610
},
{
"epoch": 0.09,
"learning_rate": 0.0003979113227187569,
"loss": 1.8369,
"step": 3620
},
{
"epoch": 0.09,
"learning_rate": 0.0003978997873933412,
"loss": 1.8123,
"step": 3630
},
{
"epoch": 0.09,
"learning_rate": 0.000397888220470122,
"loss": 1.7854,
"step": 3640
},
{
"epoch": 0.09,
"learning_rate": 0.0003978766219509463,
"loss": 1.7781,
"step": 3650
},
{
"epoch": 0.09,
"learning_rate": 0.0003978649918376658,
"loss": 1.8043,
"step": 3660
},
{
"epoch": 0.09,
"learning_rate": 0.00039785333013213765,
"loss": 1.8317,
"step": 3670
},
{
"epoch": 0.09,
"learning_rate": 0.00039784163683622374,
"loss": 1.7834,
"step": 3680
},
{
"epoch": 0.09,
"learning_rate": 0.0003978299119517911,
"loss": 1.8493,
"step": 3690
},
{
"epoch": 0.09,
"learning_rate": 0.00039781815548071167,
"loss": 1.7785,
"step": 3700
},
{
"epoch": 0.09,
"learning_rate": 0.0003978063674248627,
"loss": 1.8433,
"step": 3710
},
{
"epoch": 0.09,
"learning_rate": 0.0003977945477861263,
"loss": 1.7909,
"step": 3720
},
{
"epoch": 0.09,
"learning_rate": 0.0003977826965663896,
"loss": 1.83,
"step": 3730
},
{
"epoch": 0.1,
"learning_rate": 0.00039777081376754487,
"loss": 1.7919,
"step": 3740
},
{
"epoch": 0.1,
"learning_rate": 0.00039775889939148946,
"loss": 1.7883,
"step": 3750
},
{
"epoch": 0.1,
"learning_rate": 0.0003977469534401256,
"loss": 1.7908,
"step": 3760
},
{
"epoch": 0.1,
"learning_rate": 0.00039773497591536063,
"loss": 1.7429,
"step": 3770
},
{
"epoch": 0.1,
"learning_rate": 0.000397722966819107,
"loss": 1.8088,
"step": 3780
},
{
"epoch": 0.1,
"learning_rate": 0.00039771092615328217,
"loss": 1.8726,
"step": 3790
},
{
"epoch": 0.1,
"learning_rate": 0.0003976988539198086,
"loss": 1.8244,
"step": 3800
},
{
"epoch": 0.1,
"learning_rate": 0.0003976867501206138,
"loss": 1.8047,
"step": 3810
},
{
"epoch": 0.1,
"learning_rate": 0.0003976746147576303,
"loss": 1.8664,
"step": 3820
},
{
"epoch": 0.1,
"learning_rate": 0.0003976624478327958,
"loss": 1.8955,
"step": 3830
},
{
"epoch": 0.1,
"learning_rate": 0.00039765024934805283,
"loss": 1.8355,
"step": 3840
},
{
"epoch": 0.1,
"learning_rate": 0.00039763801930534917,
"loss": 1.8964,
"step": 3850
},
{
"epoch": 0.1,
"learning_rate": 0.00039762575770663737,
"loss": 1.9149,
"step": 3860
},
{
"epoch": 0.1,
"learning_rate": 0.00039761346455387535,
"loss": 1.878,
"step": 3870
},
{
"epoch": 0.1,
"learning_rate": 0.0003976011398490259,
"loss": 1.7572,
"step": 3880
},
{
"epoch": 0.1,
"learning_rate": 0.00039758878359405676,
"loss": 1.786,
"step": 3890
},
{
"epoch": 0.1,
"learning_rate": 0.00039757639579094084,
"loss": 1.8305,
"step": 3900
},
{
"epoch": 0.1,
"learning_rate": 0.0003975639764416561,
"loss": 1.7545,
"step": 3910
},
{
"epoch": 0.1,
"learning_rate": 0.00039755152554818543,
"loss": 1.7633,
"step": 3920
},
{
"epoch": 0.1,
"learning_rate": 0.0003975390431125168,
"loss": 1.7908,
"step": 3930
},
{
"epoch": 0.1,
"learning_rate": 0.0003975265291366432,
"loss": 1.8037,
"step": 3940
},
{
"epoch": 0.1,
"learning_rate": 0.0003975139836225628,
"loss": 1.8299,
"step": 3950
},
{
"epoch": 0.1,
"learning_rate": 0.00039750140657227853,
"loss": 1.7984,
"step": 3960
},
{
"epoch": 0.1,
"learning_rate": 0.00039748879798779866,
"loss": 1.8757,
"step": 3970
},
{
"epoch": 0.1,
"learning_rate": 0.0003974761578711363,
"loss": 1.764,
"step": 3980
},
{
"epoch": 0.1,
"learning_rate": 0.0003974634862243096,
"loss": 1.7753,
"step": 3990
},
{
"epoch": 0.1,
"learning_rate": 0.0003974507830493418,
"loss": 1.7783,
"step": 4000
},
{
"epoch": 0.1,
"learning_rate": 0.0003974380483482612,
"loss": 1.8552,
"step": 4010
},
{
"epoch": 0.1,
"learning_rate": 0.0003974252821231011,
"loss": 1.8819,
"step": 4020
},
{
"epoch": 0.1,
"learning_rate": 0.0003974124843758998,
"loss": 1.8257,
"step": 4030
},
{
"epoch": 0.1,
"learning_rate": 0.0003973996551087006,
"loss": 1.8212,
"step": 4040
},
{
"epoch": 0.1,
"learning_rate": 0.00039738679432355193,
"loss": 1.84,
"step": 4050
},
{
"epoch": 0.1,
"learning_rate": 0.0003973739020225073,
"loss": 1.8493,
"step": 4060
},
{
"epoch": 0.1,
"learning_rate": 0.0003973609782076251,
"loss": 1.8039,
"step": 4070
},
{
"epoch": 0.1,
"learning_rate": 0.0003973480228809689,
"loss": 1.7981,
"step": 4080
},
{
"epoch": 0.1,
"learning_rate": 0.00039733503604460704,
"loss": 1.8088,
"step": 4090
},
{
"epoch": 0.1,
"learning_rate": 0.0003973220177006132,
"loss": 1.7446,
"step": 4100
},
{
"epoch": 0.1,
"learning_rate": 0.00039730896785106597,
"loss": 1.8402,
"step": 4110
},
{
"epoch": 0.1,
"learning_rate": 0.00039729588649804894,
"loss": 1.799,
"step": 4120
},
{
"epoch": 0.11,
"learning_rate": 0.0003972827736436507,
"loss": 1.8064,
"step": 4130
},
{
"epoch": 0.11,
"learning_rate": 0.00039726962928996503,
"loss": 1.7946,
"step": 4140
},
{
"epoch": 0.11,
"learning_rate": 0.00039725645343909055,
"loss": 1.8193,
"step": 4150
},
{
"epoch": 0.11,
"learning_rate": 0.00039724324609313104,
"loss": 1.7728,
"step": 4160
},
{
"epoch": 0.11,
"learning_rate": 0.00039723000725419517,
"loss": 1.8646,
"step": 4170
},
{
"epoch": 0.11,
"learning_rate": 0.00039721673692439687,
"loss": 1.8326,
"step": 4180
},
{
"epoch": 0.11,
"learning_rate": 0.00039720343510585483,
"loss": 1.8098,
"step": 4190
},
{
"epoch": 0.11,
"learning_rate": 0.00039719010180069294,
"loss": 1.8529,
"step": 4200
},
{
"epoch": 0.11,
"learning_rate": 0.0003971767370110401,
"loss": 1.811,
"step": 4210
},
{
"epoch": 0.11,
"learning_rate": 0.00039716334073903016,
"loss": 1.8112,
"step": 4220
},
{
"epoch": 0.11,
"learning_rate": 0.0003971499129868021,
"loss": 1.7949,
"step": 4230
},
{
"epoch": 0.11,
"learning_rate": 0.00039713645375649985,
"loss": 1.8224,
"step": 4240
},
{
"epoch": 0.11,
"learning_rate": 0.0003971229630502723,
"loss": 1.8517,
"step": 4250
},
{
"epoch": 0.11,
"learning_rate": 0.0003971094408702736,
"loss": 1.8168,
"step": 4260
},
{
"epoch": 0.11,
"learning_rate": 0.00039709588721866267,
"loss": 1.7842,
"step": 4270
},
{
"epoch": 0.11,
"learning_rate": 0.00039708230209760365,
"loss": 1.8217,
"step": 4280
},
{
"epoch": 0.11,
"learning_rate": 0.0003970686855092655,
"loss": 1.7968,
"step": 4290
},
{
"epoch": 0.11,
"learning_rate": 0.0003970550374558224,
"loss": 1.7345,
"step": 4300
},
{
"epoch": 0.11,
"learning_rate": 0.0003970413579394535,
"loss": 1.8,
"step": 4310
},
{
"epoch": 0.11,
"learning_rate": 0.0003970276469623429,
"loss": 1.808,
"step": 4320
},
{
"epoch": 0.11,
"learning_rate": 0.0003970139045266798,
"loss": 1.8049,
"step": 4330
},
{
"epoch": 0.11,
"learning_rate": 0.0003970001306346583,
"loss": 1.8013,
"step": 4340
},
{
"epoch": 0.11,
"learning_rate": 0.0003969863252884778,
"loss": 1.8282,
"step": 4350
},
{
"epoch": 0.11,
"learning_rate": 0.0003969724884903424,
"loss": 1.8122,
"step": 4360
},
{
"epoch": 0.11,
"learning_rate": 0.0003969586202424614,
"loss": 1.8082,
"step": 4370
},
{
"epoch": 0.11,
"learning_rate": 0.0003969447205470491,
"loss": 1.7345,
"step": 4380
},
{
"epoch": 0.11,
"learning_rate": 0.00039693078940632474,
"loss": 1.7867,
"step": 4390
},
{
"epoch": 0.11,
"learning_rate": 0.0003969168268225127,
"loss": 1.7596,
"step": 4400
},
{
"epoch": 0.11,
"learning_rate": 0.0003969028327978424,
"loss": 1.8147,
"step": 4410
},
{
"epoch": 0.11,
"learning_rate": 0.0003968888073345481,
"loss": 1.8479,
"step": 4420
},
{
"epoch": 0.11,
"learning_rate": 0.00039687475043486916,
"loss": 1.7887,
"step": 4430
},
{
"epoch": 0.11,
"learning_rate": 0.00039686066210105006,
"loss": 1.8487,
"step": 4440
},
{
"epoch": 0.11,
"learning_rate": 0.00039684654233534017,
"loss": 1.8783,
"step": 4450
},
{
"epoch": 0.11,
"learning_rate": 0.00039683239113999394,
"loss": 1.7634,
"step": 4460
},
{
"epoch": 0.11,
"learning_rate": 0.0003968182085172709,
"loss": 1.7534,
"step": 4470
},
{
"epoch": 0.11,
"learning_rate": 0.0003968039944694355,
"loss": 1.7874,
"step": 4480
},
{
"epoch": 0.11,
"learning_rate": 0.00039678974899875715,
"loss": 1.7343,
"step": 4490
},
{
"epoch": 0.11,
"learning_rate": 0.0003967754721075105,
"loss": 1.7943,
"step": 4500
},
{
"epoch": 0.11,
"learning_rate": 0.00039676116379797494,
"loss": 1.7694,
"step": 4510
},
{
"epoch": 0.11,
"learning_rate": 0.0003967468240724351,
"loss": 1.823,
"step": 4520
},
{
"epoch": 0.12,
"learning_rate": 0.0003967324529331805,
"loss": 1.8147,
"step": 4530
},
{
"epoch": 0.12,
"learning_rate": 0.0003967180503825058,
"loss": 1.779,
"step": 4540
},
{
"epoch": 0.12,
"learning_rate": 0.0003967036164227105,
"loss": 1.7434,
"step": 4550
},
{
"epoch": 0.12,
"learning_rate": 0.00039668915105609925,
"loss": 1.8157,
"step": 4560
},
{
"epoch": 0.12,
"learning_rate": 0.0003966746542849816,
"loss": 1.7972,
"step": 4570
},
{
"epoch": 0.12,
"learning_rate": 0.0003966601261116723,
"loss": 1.768,
"step": 4580
},
{
"epoch": 0.12,
"learning_rate": 0.000396645566538491,
"loss": 1.7521,
"step": 4590
},
{
"epoch": 0.12,
"learning_rate": 0.0003966309755677623,
"loss": 1.8067,
"step": 4600
},
{
"epoch": 0.12,
"learning_rate": 0.0003966163532018158,
"loss": 1.7907,
"step": 4610
},
{
"epoch": 0.12,
"learning_rate": 0.0003966016994429864,
"loss": 1.7259,
"step": 4620
},
{
"epoch": 0.12,
"learning_rate": 0.0003965870142936136,
"loss": 1.8267,
"step": 4630
},
{
"epoch": 0.12,
"learning_rate": 0.0003965722977560422,
"loss": 1.7433,
"step": 4640
},
{
"epoch": 0.12,
"learning_rate": 0.000396557549832622,
"loss": 1.7433,
"step": 4650
},
{
"epoch": 0.12,
"learning_rate": 0.0003965427705257076,
"loss": 1.8058,
"step": 4660
},
{
"epoch": 0.12,
"learning_rate": 0.0003965279598376588,
"loss": 1.8024,
"step": 4670
},
{
"epoch": 0.12,
"learning_rate": 0.0003965131177708404,
"loss": 1.7884,
"step": 4680
},
{
"epoch": 0.12,
"learning_rate": 0.0003964982443276221,
"loss": 1.7835,
"step": 4690
},
{
"epoch": 0.12,
"learning_rate": 0.0003964833395103788,
"loss": 1.8312,
"step": 4700
},
{
"epoch": 0.12,
"learning_rate": 0.0003964684033214901,
"loss": 1.7961,
"step": 4710
},
{
"epoch": 0.12,
"learning_rate": 0.000396453435763341,
"loss": 1.7441,
"step": 4720
},
{
"epoch": 0.12,
"learning_rate": 0.00039643843683832115,
"loss": 1.7585,
"step": 4730
},
{
"epoch": 0.12,
"learning_rate": 0.00039642340654882544,
"loss": 1.7572,
"step": 4740
},
{
"epoch": 0.12,
"learning_rate": 0.00039640834489725366,
"loss": 1.8212,
"step": 4750
},
{
"epoch": 0.12,
"learning_rate": 0.0003963932518860106,
"loss": 1.741,
"step": 4760
},
{
"epoch": 0.12,
"learning_rate": 0.00039637812751750623,
"loss": 1.7815,
"step": 4770
},
{
"epoch": 0.12,
"learning_rate": 0.0003963629717941553,
"loss": 1.7903,
"step": 4780
},
{
"epoch": 0.12,
"learning_rate": 0.00039634778471837764,
"loss": 1.768,
"step": 4790
},
{
"epoch": 0.12,
"learning_rate": 0.0003963325662925981,
"loss": 1.7375,
"step": 4800
},
{
"epoch": 0.12,
"learning_rate": 0.00039631731651924666,
"loss": 1.7954,
"step": 4810
},
{
"epoch": 0.12,
"learning_rate": 0.000396302035400758,
"loss": 1.7816,
"step": 4820
},
{
"epoch": 0.12,
"learning_rate": 0.0003962867229395721,
"loss": 1.8233,
"step": 4830
},
{
"epoch": 0.12,
"learning_rate": 0.0003962713791381338,
"loss": 1.7717,
"step": 4840
},
{
"epoch": 0.12,
"learning_rate": 0.0003962560039988931,
"loss": 1.8,
"step": 4850
},
{
"epoch": 0.12,
"learning_rate": 0.00039624059752430473,
"loss": 1.7848,
"step": 4860
},
{
"epoch": 0.12,
"learning_rate": 0.0003962251597168286,
"loss": 1.7773,
"step": 4870
},
{
"epoch": 0.12,
"learning_rate": 0.00039620969057892967,
"loss": 1.8012,
"step": 4880
},
{
"epoch": 0.12,
"learning_rate": 0.0003961941901130778,
"loss": 1.7854,
"step": 4890
},
{
"epoch": 0.12,
"learning_rate": 0.0003961786583217478,
"loss": 1.7833,
"step": 4900
},
{
"epoch": 0.12,
"learning_rate": 0.0003961630952074197,
"loss": 1.7826,
"step": 4910
},
{
"epoch": 0.13,
"learning_rate": 0.00039614750077257826,
"loss": 1.8489,
"step": 4920
},
{
"epoch": 0.13,
"learning_rate": 0.00039613187501971346,
"loss": 1.8329,
"step": 4930
},
{
"epoch": 0.13,
"learning_rate": 0.0003961162179513202,
"loss": 1.7486,
"step": 4940
},
{
"epoch": 0.13,
"learning_rate": 0.0003961005295698984,
"loss": 1.7631,
"step": 4950
},
{
"epoch": 0.13,
"learning_rate": 0.00039608480987795284,
"loss": 1.7728,
"step": 4960
},
{
"epoch": 0.13,
"learning_rate": 0.0003960690588779935,
"loss": 1.7824,
"step": 4970
},
{
"epoch": 0.13,
"learning_rate": 0.0003960532765725352,
"loss": 1.7664,
"step": 4980
},
{
"epoch": 0.13,
"learning_rate": 0.000396037462964098,
"loss": 1.7976,
"step": 4990
},
{
"epoch": 0.13,
"learning_rate": 0.00039602161805520666,
"loss": 1.7426,
"step": 5000
},
{
"epoch": 0.13,
"learning_rate": 0.0003960057418483911,
"loss": 1.806,
"step": 5010
},
{
"epoch": 0.13,
"learning_rate": 0.0003959898343461862,
"loss": 1.7553,
"step": 5020
},
{
"epoch": 0.13,
"learning_rate": 0.0003959738955511318,
"loss": 1.7616,
"step": 5030
},
{
"epoch": 0.13,
"learning_rate": 0.00039595792546577276,
"loss": 1.767,
"step": 5040
},
{
"epoch": 0.13,
"learning_rate": 0.00039594192409265913,
"loss": 1.762,
"step": 5050
},
{
"epoch": 0.13,
"learning_rate": 0.00039592589143434565,
"loss": 1.8521,
"step": 5060
},
{
"epoch": 0.13,
"learning_rate": 0.0003959098274933921,
"loss": 2.1704,
"step": 5070
},
{
"epoch": 0.13,
"learning_rate": 0.00039589373227236354,
"loss": 1.8074,
"step": 5080
},
{
"epoch": 0.13,
"learning_rate": 0.0003958776057738297,
"loss": 1.854,
"step": 5090
},
{
"epoch": 0.13,
"learning_rate": 0.00039586144800036544,
"loss": 1.7474,
"step": 5100
},
{
"epoch": 0.13,
"learning_rate": 0.0003958452589545506,
"loss": 1.7693,
"step": 5110
},
{
"epoch": 0.13,
"learning_rate": 0.00039582903863897,
"loss": 1.7798,
"step": 5120
},
{
"epoch": 0.13,
"learning_rate": 0.00039581278705621355,
"loss": 1.7063,
"step": 5130
},
{
"epoch": 0.13,
"learning_rate": 0.000395796504208876,
"loss": 1.7747,
"step": 5140
},
{
"epoch": 0.13,
"learning_rate": 0.00039578019009955717,
"loss": 1.7526,
"step": 5150
},
{
"epoch": 0.13,
"learning_rate": 0.0003957638447308619,
"loss": 1.7272,
"step": 5160
},
{
"epoch": 0.13,
"learning_rate": 0.0003957474681053999,
"loss": 1.727,
"step": 5170
},
{
"epoch": 0.13,
"learning_rate": 0.000395731060225786,
"loss": 1.7858,
"step": 5180
},
{
"epoch": 0.13,
"learning_rate": 0.00039571462109464005,
"loss": 1.7879,
"step": 5190
},
{
"epoch": 0.13,
"learning_rate": 0.0003956981507145867,
"loss": 1.7494,
"step": 5200
},
{
"epoch": 0.13,
"learning_rate": 0.0003956816490882558,
"loss": 1.7716,
"step": 5210
},
{
"epoch": 0.13,
"learning_rate": 0.00039566511621828203,
"loss": 1.7585,
"step": 5220
},
{
"epoch": 0.13,
"learning_rate": 0.00039564855210730515,
"loss": 1.7309,
"step": 5230
},
{
"epoch": 0.13,
"learning_rate": 0.0003956319567579698,
"loss": 1.7503,
"step": 5240
},
{
"epoch": 0.13,
"learning_rate": 0.0003956153301729258,
"loss": 1.7631,
"step": 5250
},
{
"epoch": 0.13,
"learning_rate": 0.00039559867235482784,
"loss": 1.6883,
"step": 5260
},
{
"epoch": 0.13,
"learning_rate": 0.00039558198330633555,
"loss": 1.7946,
"step": 5270
},
{
"epoch": 0.13,
"learning_rate": 0.00039556526303011354,
"loss": 1.7962,
"step": 5280
},
{
"epoch": 0.13,
"learning_rate": 0.00039554851152883157,
"loss": 1.7869,
"step": 5290
},
{
"epoch": 0.13,
"learning_rate": 0.0003955317288051643,
"loss": 1.7507,
"step": 5300
},
{
"epoch": 0.14,
"learning_rate": 0.0003955149148617912,
"loss": 1.7614,
"step": 5310
},
{
"epoch": 0.14,
"learning_rate": 0.0003954980697013971,
"loss": 1.7288,
"step": 5320
},
{
"epoch": 0.14,
"learning_rate": 0.0003954811933266714,
"loss": 1.8662,
"step": 5330
},
{
"epoch": 0.14,
"learning_rate": 0.0003954642857403088,
"loss": 1.7724,
"step": 5340
},
{
"epoch": 0.14,
"learning_rate": 0.00039544734694500874,
"loss": 1.8028,
"step": 5350
},
{
"epoch": 0.14,
"learning_rate": 0.0003954303769434759,
"loss": 1.7507,
"step": 5360
},
{
"epoch": 0.14,
"learning_rate": 0.0003954133757384197,
"loss": 1.74,
"step": 5370
},
{
"epoch": 0.14,
"learning_rate": 0.0003953963433325547,
"loss": 1.7622,
"step": 5380
},
{
"epoch": 0.14,
"learning_rate": 0.00039537927972860043,
"loss": 1.7755,
"step": 5390
},
{
"epoch": 0.14,
"learning_rate": 0.0003953621849292813,
"loss": 1.7695,
"step": 5400
},
{
"epoch": 0.14,
"learning_rate": 0.00039534505893732677,
"loss": 1.7455,
"step": 5410
},
{
"epoch": 0.14,
"learning_rate": 0.0003953279017554713,
"loss": 1.8059,
"step": 5420
},
{
"epoch": 0.14,
"learning_rate": 0.0003953107133864542,
"loss": 1.7896,
"step": 5430
},
{
"epoch": 0.14,
"learning_rate": 0.00039529349383302006,
"loss": 1.7552,
"step": 5440
},
{
"epoch": 0.14,
"learning_rate": 0.00039527624309791806,
"loss": 1.7444,
"step": 5450
},
{
"epoch": 0.14,
"learning_rate": 0.00039525896118390266,
"loss": 1.714,
"step": 5460
},
{
"epoch": 0.14,
"learning_rate": 0.00039524164809373315,
"loss": 1.8,
"step": 5470
},
{
"epoch": 0.14,
"learning_rate": 0.0003952243038301738,
"loss": 1.7523,
"step": 5480
},
{
"epoch": 0.14,
"learning_rate": 0.000395206928395994,
"loss": 1.7239,
"step": 5490
},
{
"epoch": 0.14,
"learning_rate": 0.00039518952179396795,
"loss": 1.7658,
"step": 5500
},
{
"epoch": 0.14,
"learning_rate": 0.00039517208402687487,
"loss": 1.7425,
"step": 5510
},
{
"epoch": 0.14,
"learning_rate": 0.00039515461509749897,
"loss": 1.7497,
"step": 5520
},
{
"epoch": 0.14,
"learning_rate": 0.00039513711500862946,
"loss": 1.7329,
"step": 5530
},
{
"epoch": 0.14,
"learning_rate": 0.00039511958376306055,
"loss": 1.7289,
"step": 5540
},
{
"epoch": 0.14,
"learning_rate": 0.0003951020213635912,
"loss": 1.7602,
"step": 5550
},
{
"epoch": 0.14,
"learning_rate": 0.00039508442781302576,
"loss": 1.8221,
"step": 5560
},
{
"epoch": 0.14,
"learning_rate": 0.0003950668031141732,
"loss": 1.8125,
"step": 5570
},
{
"epoch": 0.14,
"learning_rate": 0.00039504914726984754,
"loss": 1.7633,
"step": 5580
},
{
"epoch": 0.14,
"learning_rate": 0.00039503146028286787,
"loss": 1.7667,
"step": 5590
},
{
"epoch": 0.14,
"learning_rate": 0.0003950137421560582,
"loss": 1.7723,
"step": 5600
},
{
"epoch": 0.14,
"learning_rate": 0.0003949959928922475,
"loss": 1.7784,
"step": 5610
},
{
"epoch": 0.14,
"learning_rate": 0.0003949782124942697,
"loss": 1.7889,
"step": 5620
},
{
"epoch": 0.14,
"learning_rate": 0.00039496040096496373,
"loss": 1.776,
"step": 5630
},
{
"epoch": 0.14,
"learning_rate": 0.00039494255830717346,
"loss": 1.7307,
"step": 5640
},
{
"epoch": 0.14,
"learning_rate": 0.00039492468452374775,
"loss": 1.7699,
"step": 5650
},
{
"epoch": 0.14,
"learning_rate": 0.00039490677961754044,
"loss": 1.7956,
"step": 5660
},
{
"epoch": 0.14,
"learning_rate": 0.0003948888435914104,
"loss": 1.7993,
"step": 5670
},
{
"epoch": 0.14,
"learning_rate": 0.00039487087644822126,
"loss": 1.7359,
"step": 5680
},
{
"epoch": 0.14,
"learning_rate": 0.0003948528781908419,
"loss": 1.7289,
"step": 5690
},
{
"epoch": 0.14,
"learning_rate": 0.0003948348488221459,
"loss": 1.8222,
"step": 5700
},
{
"epoch": 0.15,
"learning_rate": 0.00039481678834501203,
"loss": 1.768,
"step": 5710
},
{
"epoch": 0.15,
"learning_rate": 0.00039479869676232386,
"loss": 1.7411,
"step": 5720
},
{
"epoch": 0.15,
"learning_rate": 0.00039478057407697,
"loss": 1.7199,
"step": 5730
},
{
"epoch": 0.15,
"learning_rate": 0.000394762420291844,
"loss": 1.7037,
"step": 5740
},
{
"epoch": 0.15,
"learning_rate": 0.0003947442354098445,
"loss": 1.7107,
"step": 5750
},
{
"epoch": 0.15,
"learning_rate": 0.00039472601943387495,
"loss": 1.7102,
"step": 5760
},
{
"epoch": 0.15,
"learning_rate": 0.00039470777236684377,
"loss": 1.7865,
"step": 5770
},
{
"epoch": 0.15,
"learning_rate": 0.00039468949421166436,
"loss": 1.7583,
"step": 5780
},
{
"epoch": 0.15,
"learning_rate": 0.0003946711849712553,
"loss": 1.7925,
"step": 5790
},
{
"epoch": 0.15,
"learning_rate": 0.0003946528446485398,
"loss": 1.7322,
"step": 5800
},
{
"epoch": 0.15,
"learning_rate": 0.00039463447324644614,
"loss": 1.7248,
"step": 5810
},
{
"epoch": 0.15,
"learning_rate": 0.00039461607076790773,
"loss": 1.7555,
"step": 5820
},
{
"epoch": 0.15,
"learning_rate": 0.00039459763721586265,
"loss": 1.7371,
"step": 5830
},
{
"epoch": 0.15,
"learning_rate": 0.0003945791725932543,
"loss": 1.713,
"step": 5840
},
{
"epoch": 0.15,
"learning_rate": 0.0003945606769030307,
"loss": 1.7754,
"step": 5850
},
{
"epoch": 0.15,
"learning_rate": 0.00039454215014814506,
"loss": 1.7139,
"step": 5860
},
{
"epoch": 0.15,
"learning_rate": 0.0003945235923315554,
"loss": 1.7106,
"step": 5870
},
{
"epoch": 0.15,
"learning_rate": 0.00039450500345622485,
"loss": 1.7615,
"step": 5880
},
{
"epoch": 0.15,
"learning_rate": 0.0003944863835251214,
"loss": 1.7612,
"step": 5890
},
{
"epoch": 0.15,
"learning_rate": 0.0003944677325412179,
"loss": 1.7308,
"step": 5900
},
{
"epoch": 0.15,
"learning_rate": 0.0003944490505074924,
"loss": 1.7304,
"step": 5910
},
{
"epoch": 0.15,
"learning_rate": 0.00039443033742692774,
"loss": 1.7037,
"step": 5920
},
{
"epoch": 0.15,
"learning_rate": 0.00039441159330251167,
"loss": 1.725,
"step": 5930
},
{
"epoch": 0.15,
"learning_rate": 0.0003943928181372372,
"loss": 1.7284,
"step": 5940
},
{
"epoch": 0.15,
"learning_rate": 0.00039437401193410183,
"loss": 1.7352,
"step": 5950
},
{
"epoch": 0.15,
"learning_rate": 0.0003943551746961084,
"loss": 1.7093,
"step": 5960
},
{
"epoch": 0.15,
"learning_rate": 0.0003943363064262646,
"loss": 1.8308,
"step": 5970
},
{
"epoch": 0.15,
"learning_rate": 0.0003943174071275829,
"loss": 2.0842,
"step": 5980
},
{
"epoch": 0.15,
"learning_rate": 0.000394298476803081,
"loss": 1.7864,
"step": 5990
},
{
"epoch": 0.15,
"learning_rate": 0.0003942795154557814,
"loss": 1.6993,
"step": 6000
},
{
"epoch": 0.15,
"learning_rate": 0.0003942605230887116,
"loss": 1.7407,
"step": 6010
},
{
"epoch": 0.15,
"learning_rate": 0.00039424149970490396,
"loss": 1.695,
"step": 6020
},
{
"epoch": 0.15,
"learning_rate": 0.00039422244530739584,
"loss": 1.7297,
"step": 6030
},
{
"epoch": 0.15,
"learning_rate": 0.0003942033598992297,
"loss": 1.7075,
"step": 6040
},
{
"epoch": 0.15,
"learning_rate": 0.0003941842434834527,
"loss": 1.7377,
"step": 6050
},
{
"epoch": 0.15,
"learning_rate": 0.0003941650960631172,
"loss": 1.7527,
"step": 6060
},
{
"epoch": 0.15,
"learning_rate": 0.0003941459176412802,
"loss": 1.7288,
"step": 6070
},
{
"epoch": 0.15,
"learning_rate": 0.00039412670822100405,
"loss": 1.7208,
"step": 6080
},
{
"epoch": 0.15,
"learning_rate": 0.0003941074678053557,
"loss": 1.7281,
"step": 6090
},
{
"epoch": 0.16,
"learning_rate": 0.0003940881963974072,
"loss": 1.7372,
"step": 6100
},
{
"epoch": 0.16,
"learning_rate": 0.00039406889400023557,
"loss": 1.767,
"step": 6110
},
{
"epoch": 0.16,
"learning_rate": 0.00039404956061692267,
"loss": 1.6922,
"step": 6120
},
{
"epoch": 0.16,
"learning_rate": 0.0003940301962505555,
"loss": 1.6791,
"step": 6130
},
{
"epoch": 0.16,
"learning_rate": 0.00039401080090422573,
"loss": 1.7537,
"step": 6140
},
{
"epoch": 0.16,
"learning_rate": 0.00039399137458103026,
"loss": 1.7717,
"step": 6150
},
{
"epoch": 0.16,
"learning_rate": 0.00039397191728407076,
"loss": 1.7228,
"step": 6160
},
{
"epoch": 0.16,
"learning_rate": 0.0003939524290164539,
"loss": 1.7459,
"step": 6170
},
{
"epoch": 0.16,
"learning_rate": 0.00039393290978129126,
"loss": 1.7059,
"step": 6180
},
{
"epoch": 0.16,
"learning_rate": 0.0003939133595816994,
"loss": 1.7093,
"step": 6190
},
{
"epoch": 0.16,
"learning_rate": 0.00039389377842079986,
"loss": 1.737,
"step": 6200
},
{
"epoch": 0.16,
"learning_rate": 0.00039387416630171904,
"loss": 1.7162,
"step": 6210
},
{
"epoch": 0.16,
"learning_rate": 0.00039385452322758833,
"loss": 1.7225,
"step": 6220
},
{
"epoch": 0.16,
"learning_rate": 0.00039383484920154407,
"loss": 1.73,
"step": 6230
},
{
"epoch": 0.16,
"learning_rate": 0.00039381514422672745,
"loss": 1.7722,
"step": 6240
},
{
"epoch": 0.16,
"learning_rate": 0.0003937954083062848,
"loss": 1.7125,
"step": 6250
},
{
"epoch": 0.16,
"learning_rate": 0.00039377564144336713,
"loss": 1.7239,
"step": 6260
},
{
"epoch": 0.16,
"learning_rate": 0.00039375584364113067,
"loss": 1.7147,
"step": 6270
},
{
"epoch": 0.16,
"learning_rate": 0.0003937360149027364,
"loss": 1.7597,
"step": 6280
},
{
"epoch": 0.16,
"learning_rate": 0.00039371615523135024,
"loss": 1.7395,
"step": 6290
},
{
"epoch": 0.16,
"learning_rate": 0.0003936962646301432,
"loss": 1.6462,
"step": 6300
},
{
"epoch": 0.16,
"learning_rate": 0.0003936763431022909,
"loss": 1.738,
"step": 6310
},
{
"epoch": 0.16,
"learning_rate": 0.00039365639065097445,
"loss": 1.7236,
"step": 6320
},
{
"epoch": 0.16,
"learning_rate": 0.00039363640727937927,
"loss": 1.6844,
"step": 6330
},
{
"epoch": 0.16,
"learning_rate": 0.0003936163929906963,
"loss": 1.7056,
"step": 6340
},
{
"epoch": 0.16,
"learning_rate": 0.00039359634778812086,
"loss": 1.7279,
"step": 6350
},
{
"epoch": 0.16,
"learning_rate": 0.00039357627167485365,
"loss": 1.7891,
"step": 6360
},
{
"epoch": 0.16,
"learning_rate": 0.0003935561646541001,
"loss": 1.8111,
"step": 6370
},
{
"epoch": 0.16,
"learning_rate": 0.00039353602672907067,
"loss": 1.7169,
"step": 6380
},
{
"epoch": 0.16,
"learning_rate": 0.0003935158579029806,
"loss": 1.7333,
"step": 6390
},
{
"epoch": 0.16,
"learning_rate": 0.0003934956581790501,
"loss": 1.7419,
"step": 6400
},
{
"epoch": 0.16,
"learning_rate": 0.00039347542756050453,
"loss": 1.7139,
"step": 6410
},
{
"epoch": 0.16,
"learning_rate": 0.00039345516605057397,
"loss": 1.7196,
"step": 6420
},
{
"epoch": 0.16,
"learning_rate": 0.00039343487365249346,
"loss": 1.6798,
"step": 6430
},
{
"epoch": 0.16,
"learning_rate": 0.0003934145503695031,
"loss": 1.7796,
"step": 6440
},
{
"epoch": 0.16,
"learning_rate": 0.0003933941962048476,
"loss": 1.7407,
"step": 6450
},
{
"epoch": 0.16,
"learning_rate": 0.00039337381116177705,
"loss": 1.68,
"step": 6460
},
{
"epoch": 0.16,
"learning_rate": 0.0003933533952435461,
"loss": 1.7518,
"step": 6470
},
{
"epoch": 0.16,
"learning_rate": 0.00039333294845341453,
"loss": 1.7048,
"step": 6480
},
{
"epoch": 0.17,
"learning_rate": 0.000393312470794647,
"loss": 1.746,
"step": 6490
},
{
"epoch": 0.17,
"learning_rate": 0.0003932919622705131,
"loss": 1.7358,
"step": 6500
},
{
"epoch": 0.17,
"learning_rate": 0.00039327142288428726,
"loss": 1.7378,
"step": 6510
},
{
"epoch": 0.17,
"learning_rate": 0.000393250852639249,
"loss": 1.7156,
"step": 6520
},
{
"epoch": 0.17,
"learning_rate": 0.0003932302515386826,
"loss": 1.7146,
"step": 6530
},
{
"epoch": 0.17,
"learning_rate": 0.00039320961958587745,
"loss": 1.7649,
"step": 6540
},
{
"epoch": 0.17,
"learning_rate": 0.00039318895678412766,
"loss": 1.7608,
"step": 6550
},
{
"epoch": 0.17,
"learning_rate": 0.0003931682631367324,
"loss": 1.6887,
"step": 6560
},
{
"epoch": 0.17,
"learning_rate": 0.0003931475386469958,
"loss": 1.7031,
"step": 6570
},
{
"epoch": 0.17,
"learning_rate": 0.00039312678331822684,
"loss": 1.7654,
"step": 6580
},
{
"epoch": 0.17,
"learning_rate": 0.0003931059971537394,
"loss": 1.7527,
"step": 6590
},
{
"epoch": 0.17,
"learning_rate": 0.00039308518015685227,
"loss": 1.7217,
"step": 6600
},
{
"epoch": 0.17,
"learning_rate": 0.00039306433233088925,
"loss": 1.7602,
"step": 6610
},
{
"epoch": 0.17,
"learning_rate": 0.000393043453679179,
"loss": 1.7707,
"step": 6620
},
{
"epoch": 0.17,
"learning_rate": 0.0003930225442050552,
"loss": 1.7279,
"step": 6630
},
{
"epoch": 0.17,
"learning_rate": 0.00039300160391185637,
"loss": 1.7004,
"step": 6640
},
{
"epoch": 0.17,
"learning_rate": 0.0003929806328029258,
"loss": 1.7322,
"step": 6650
},
{
"epoch": 0.17,
"learning_rate": 0.00039295963088161205,
"loss": 1.7124,
"step": 6660
},
{
"epoch": 0.17,
"learning_rate": 0.00039293859815126833,
"loss": 1.7329,
"step": 6670
},
{
"epoch": 0.17,
"learning_rate": 0.0003929175346152528,
"loss": 1.7658,
"step": 6680
},
{
"epoch": 0.17,
"learning_rate": 0.00039289644027692863,
"loss": 1.6922,
"step": 6690
},
{
"epoch": 0.17,
"learning_rate": 0.00039287531513966387,
"loss": 1.7263,
"step": 6700
},
{
"epoch": 0.17,
"learning_rate": 0.00039285415920683146,
"loss": 1.7444,
"step": 6710
},
{
"epoch": 0.17,
"learning_rate": 0.00039283297248180924,
"loss": 1.7123,
"step": 6720
},
{
"epoch": 0.17,
"learning_rate": 0.00039281175496798013,
"loss": 1.7342,
"step": 6730
},
{
"epoch": 0.17,
"learning_rate": 0.0003927905066687317,
"loss": 1.7511,
"step": 6740
},
{
"epoch": 0.17,
"learning_rate": 0.00039276922758745665,
"loss": 1.6741,
"step": 6750
},
{
"epoch": 0.17,
"learning_rate": 0.0003927479177275525,
"loss": 1.6909,
"step": 6760
},
{
"epoch": 0.17,
"learning_rate": 0.0003927265770924216,
"loss": 1.7151,
"step": 6770
},
{
"epoch": 0.17,
"learning_rate": 0.00039270520568547156,
"loss": 1.7389,
"step": 6780
},
{
"epoch": 0.17,
"learning_rate": 0.00039268380351011446,
"loss": 1.7088,
"step": 6790
},
{
"epoch": 0.17,
"learning_rate": 0.00039266237056976753,
"loss": 1.7261,
"step": 6800
},
{
"epoch": 0.17,
"learning_rate": 0.00039264090686785293,
"loss": 1.7018,
"step": 6810
},
{
"epoch": 0.17,
"learning_rate": 0.00039261941240779756,
"loss": 1.6874,
"step": 6820
},
{
"epoch": 0.17,
"learning_rate": 0.0003925978871930335,
"loss": 1.7276,
"step": 6830
},
{
"epoch": 0.17,
"learning_rate": 0.00039257633122699755,
"loss": 1.7157,
"step": 6840
},
{
"epoch": 0.17,
"learning_rate": 0.0003925547445131313,
"loss": 1.7039,
"step": 6850
},
{
"epoch": 0.17,
"learning_rate": 0.0003925331270548816,
"loss": 1.7528,
"step": 6860
},
{
"epoch": 0.17,
"learning_rate": 0.0003925114788557,
"loss": 1.6982,
"step": 6870
},
{
"epoch": 0.18,
"learning_rate": 0.0003924897999190429,
"loss": 1.7167,
"step": 6880
},
{
"epoch": 0.18,
"learning_rate": 0.00039246809024837164,
"loss": 1.6949,
"step": 6890
},
{
"epoch": 0.18,
"learning_rate": 0.00039244634984715257,
"loss": 1.8046,
"step": 6900
},
{
"epoch": 0.18,
"learning_rate": 0.00039242457871885696,
"loss": 1.7028,
"step": 6910
},
{
"epoch": 0.18,
"learning_rate": 0.0003924027768669608,
"loss": 1.7132,
"step": 6920
},
{
"epoch": 0.18,
"learning_rate": 0.0003923809442949452,
"loss": 1.6916,
"step": 6930
},
{
"epoch": 0.18,
"learning_rate": 0.0003923590810062959,
"loss": 1.7176,
"step": 6940
},
{
"epoch": 0.18,
"learning_rate": 0.00039233718700450393,
"loss": 1.7359,
"step": 6950
},
{
"epoch": 0.18,
"learning_rate": 0.00039231526229306483,
"loss": 1.7271,
"step": 6960
},
{
"epoch": 0.18,
"learning_rate": 0.00039229330687547934,
"loss": 1.6948,
"step": 6970
},
{
"epoch": 0.18,
"learning_rate": 0.00039227132075525295,
"loss": 1.7418,
"step": 6980
},
{
"epoch": 0.18,
"learning_rate": 0.0003922493039358961,
"loss": 1.7207,
"step": 6990
},
{
"epoch": 0.18,
"learning_rate": 0.0003922272564209241,
"loss": 1.6874,
"step": 7000
},
{
"epoch": 0.18,
"learning_rate": 0.00039220517821385715,
"loss": 1.7025,
"step": 7010
},
{
"epoch": 0.18,
"learning_rate": 0.00039218306931822043,
"loss": 1.7748,
"step": 7020
},
{
"epoch": 0.18,
"learning_rate": 0.000392160929737544,
"loss": 1.7008,
"step": 7030
},
{
"epoch": 0.18,
"learning_rate": 0.0003921387594753627,
"loss": 1.6817,
"step": 7040
},
{
"epoch": 0.18,
"grad_norm": 0.1620354801416397,
"learning_rate": 0.0003921165585352165,
"loss": 1.7891,
"step": 7050
},
{
"epoch": 0.18,
"grad_norm": 0.148910254240036,
"learning_rate": 0.00039209432692064995,
"loss": 1.7461,
"step": 7060
},
{
"epoch": 0.18,
"grad_norm": 0.15584000945091248,
"learning_rate": 0.0003920720646352128,
"loss": 1.7547,
"step": 7070
},
{
"epoch": 0.18,
"grad_norm": 0.16131040453910828,
"learning_rate": 0.0003920497716824596,
"loss": 1.7804,
"step": 7080
},
{
"epoch": 0.18,
"grad_norm": 0.1430855393409729,
"learning_rate": 0.0003920274480659496,
"loss": 1.7793,
"step": 7090
},
{
"epoch": 0.18,
"grad_norm": 0.15294766426086426,
"learning_rate": 0.0003920050937892473,
"loss": 1.7156,
"step": 7100
},
{
"epoch": 0.18,
"grad_norm": 0.13739879429340363,
"learning_rate": 0.00039198270885592174,
"loss": 1.7659,
"step": 7110
},
{
"epoch": 0.18,
"grad_norm": 0.1557493805885315,
"learning_rate": 0.0003919602932695472,
"loss": 1.7918,
"step": 7120
},
{
"epoch": 0.18,
"grad_norm": 0.12877637147903442,
"learning_rate": 0.00039193784703370264,
"loss": 1.7088,
"step": 7130
},
{
"epoch": 0.18,
"grad_norm": 0.13669374585151672,
"learning_rate": 0.00039191537015197185,
"loss": 1.7093,
"step": 7140
},
{
"epoch": 0.18,
"grad_norm": 0.12947019934654236,
"learning_rate": 0.00039189286262794376,
"loss": 1.6805,
"step": 7150
},
{
"epoch": 0.18,
"grad_norm": 0.14283278584480286,
"learning_rate": 0.0003918703244652119,
"loss": 1.6415,
"step": 7160
},
{
"epoch": 0.18,
"grad_norm": 0.13801805675029755,
"learning_rate": 0.00039184775566737494,
"loss": 1.692,
"step": 7170
},
{
"epoch": 0.18,
"grad_norm": 0.16947269439697266,
"learning_rate": 0.0003918251562380363,
"loss": 1.7506,
"step": 7180
},
{
"epoch": 0.18,
"grad_norm": 0.13640892505645752,
"learning_rate": 0.0003918025261808043,
"loss": 1.6777,
"step": 7190
},
{
"epoch": 0.18,
"grad_norm": 0.1325259953737259,
"learning_rate": 0.0003917798654992923,
"loss": 1.6815,
"step": 7200
},
{
"epoch": 0.18,
"grad_norm": 0.13371969759464264,
"learning_rate": 0.00039175717419711833,
"loss": 1.6929,
"step": 7210
},
{
"epoch": 0.18,
"grad_norm": 0.13861477375030518,
"learning_rate": 0.0003917344522779054,
"loss": 1.7506,
"step": 7220
},
{
"epoch": 0.18,
"grad_norm": 0.14447219669818878,
"learning_rate": 0.00039171169974528157,
"loss": 1.7032,
"step": 7230
},
{
"epoch": 0.18,
"grad_norm": 0.13237527012825012,
"learning_rate": 0.0003916889166028795,
"loss": 1.6761,
"step": 7240
},
{
"epoch": 0.18,
"grad_norm": 0.1482444852590561,
"learning_rate": 0.00039166610285433685,
"loss": 1.711,
"step": 7250
},
{
"epoch": 0.18,
"grad_norm": 0.13787992298603058,
"learning_rate": 0.0003916432585032963,
"loss": 1.7039,
"step": 7260
},
{
"epoch": 0.18,
"grad_norm": 0.13764281570911407,
"learning_rate": 0.0003916203835534052,
"loss": 1.6459,
"step": 7270
},
{
"epoch": 0.19,
"grad_norm": 0.14277833700180054,
"learning_rate": 0.000391597478008316,
"loss": 1.7331,
"step": 7280
},
{
"epoch": 0.19,
"grad_norm": 0.14394375681877136,
"learning_rate": 0.0003915745418716859,
"loss": 1.7245,
"step": 7290
},
{
"epoch": 0.19,
"grad_norm": 0.12185429036617279,
"learning_rate": 0.00039155157514717703,
"loss": 1.6878,
"step": 7300
},
{
"epoch": 0.19,
"grad_norm": 0.12890471518039703,
"learning_rate": 0.0003915285778384563,
"loss": 1.6844,
"step": 7310
},
{
"epoch": 0.19,
"grad_norm": 0.13018891215324402,
"learning_rate": 0.0003915055499491957,
"loss": 1.6959,
"step": 7320
},
{
"epoch": 0.19,
"grad_norm": 0.169634148478508,
"learning_rate": 0.0003914824914830719,
"loss": 1.7361,
"step": 7330
},
{
"epoch": 0.19,
"grad_norm": 0.1414484977722168,
"learning_rate": 0.00039145940244376655,
"loss": 1.7753,
"step": 7340
},
{
"epoch": 0.19,
"grad_norm": 0.14286017417907715,
"learning_rate": 0.0003914362828349663,
"loss": 1.6969,
"step": 7350
},
{
"epoch": 0.19,
"grad_norm": 0.13231833279132843,
"learning_rate": 0.0003914131326603624,
"loss": 1.7277,
"step": 7360
},
{
"epoch": 0.19,
"grad_norm": 0.14872434735298157,
"learning_rate": 0.00039138995192365125,
"loss": 1.736,
"step": 7370
},
{
"epoch": 0.19,
"grad_norm": 0.1488579511642456,
"learning_rate": 0.000391366740628534,
"loss": 1.6918,
"step": 7380
},
{
"epoch": 0.19,
"grad_norm": 0.1341143697500229,
"learning_rate": 0.0003913434987787166,
"loss": 1.6209,
"step": 7390
},
{
"epoch": 0.19,
"grad_norm": 0.1347276121377945,
"learning_rate": 0.0003913202263779101,
"loss": 1.6827,
"step": 7400
},
{
"epoch": 0.19,
"grad_norm": 0.12921172380447388,
"learning_rate": 0.00039129692342983023,
"loss": 1.6474,
"step": 7410
},
{
"epoch": 0.19,
"grad_norm": 0.1317102164030075,
"learning_rate": 0.00039127358993819777,
"loss": 1.6786,
"step": 7420
},
{
"epoch": 0.19,
"grad_norm": 0.14124788343906403,
"learning_rate": 0.0003912502259067381,
"loss": 1.764,
"step": 7430
},
{
"epoch": 0.19,
"grad_norm": 0.14556260406970978,
"learning_rate": 0.0003912268313391818,
"loss": 1.701,
"step": 7440
},
{
"epoch": 0.19,
"grad_norm": 0.13149979710578918,
"learning_rate": 0.0003912034062392642,
"loss": 1.7006,
"step": 7450
},
{
"epoch": 0.19,
"grad_norm": 0.13681216537952423,
"learning_rate": 0.0003911799506107253,
"loss": 1.706,
"step": 7460
},
{
"epoch": 0.19,
"grad_norm": 0.13194845616817474,
"learning_rate": 0.0003911564644573103,
"loss": 1.7716,
"step": 7470
},
{
"epoch": 0.19,
"grad_norm": 0.13681434094905853,
"learning_rate": 0.0003911329477827692,
"loss": 1.7356,
"step": 7480
},
{
"epoch": 0.19,
"grad_norm": 0.1333349198102951,
"learning_rate": 0.00039110940059085665,
"loss": 1.7158,
"step": 7490
},
{
"epoch": 0.19,
"grad_norm": 0.13811297714710236,
"learning_rate": 0.0003910858228853324,
"loss": 1.7472,
"step": 7500
},
{
"epoch": 0.19,
"grad_norm": 0.1425902247428894,
"learning_rate": 0.000391062214669961,
"loss": 1.7062,
"step": 7510
},
{
"epoch": 0.19,
"grad_norm": 0.13320201635360718,
"learning_rate": 0.0003910385759485119,
"loss": 1.6833,
"step": 7520
},
{
"epoch": 0.19,
"grad_norm": 0.13622677326202393,
"learning_rate": 0.0003910149067247593,
"loss": 1.7262,
"step": 7530
},
{
"epoch": 0.19,
"grad_norm": 0.13262036442756653,
"learning_rate": 0.00039099120700248247,
"loss": 1.68,
"step": 7540
},
{
"epoch": 0.19,
"grad_norm": 0.13869726657867432,
"learning_rate": 0.00039096747678546537,
"loss": 1.6351,
"step": 7550
},
{
"epoch": 0.19,
"grad_norm": 0.13586406409740448,
"learning_rate": 0.000390943716077497,
"loss": 1.7085,
"step": 7560
},
{
"epoch": 0.19,
"grad_norm": 0.13860741257667542,
"learning_rate": 0.00039091992488237093,
"loss": 1.7097,
"step": 7570
},
{
"epoch": 0.19,
"grad_norm": 0.14486007392406464,
"learning_rate": 0.00039089610320388604,
"loss": 1.711,
"step": 7580
},
{
"epoch": 0.19,
"grad_norm": 0.14704862236976624,
"learning_rate": 0.00039087225104584563,
"loss": 1.6913,
"step": 7590
},
{
"epoch": 0.19,
"grad_norm": 0.15811263024806976,
"learning_rate": 0.0003908483684120582,
"loss": 1.6907,
"step": 7600
},
{
"epoch": 0.19,
"grad_norm": 0.13361206650733948,
"learning_rate": 0.0003908244553063369,
"loss": 1.6782,
"step": 7610
},
{
"epoch": 0.19,
"grad_norm": 0.13869741559028625,
"learning_rate": 0.0003908005117324999,
"loss": 1.7114,
"step": 7620
},
{
"epoch": 0.19,
"grad_norm": 0.12840087711811066,
"learning_rate": 0.0003907765376943702,
"loss": 1.6481,
"step": 7630
},
{
"epoch": 0.19,
"grad_norm": 0.14002574980258942,
"learning_rate": 0.0003907525331957755,
"loss": 1.6603,
"step": 7640
},
{
"epoch": 0.19,
"grad_norm": 0.14109046757221222,
"learning_rate": 0.00039072849824054866,
"loss": 1.7,
"step": 7650
},
{
"epoch": 0.19,
"grad_norm": 0.12924519181251526,
"learning_rate": 0.0003907044328325271,
"loss": 1.6869,
"step": 7660
},
{
"epoch": 0.2,
"grad_norm": 0.13750408589839935,
"learning_rate": 0.00039068033697555333,
"loss": 1.6731,
"step": 7670
},
{
"epoch": 0.2,
"grad_norm": 0.16169866919517517,
"learning_rate": 0.0003906562106734745,
"loss": 1.7461,
"step": 7680
},
{
"epoch": 0.2,
"grad_norm": 0.13251982629299164,
"learning_rate": 0.00039063205393014287,
"loss": 1.6876,
"step": 7690
},
{
"epoch": 0.2,
"grad_norm": 0.15520523488521576,
"learning_rate": 0.0003906078667494154,
"loss": 1.7065,
"step": 7700
},
{
"epoch": 0.2,
"grad_norm": 0.12816987931728363,
"learning_rate": 0.000390583649135154,
"loss": 1.654,
"step": 7710
},
{
"epoch": 0.2,
"grad_norm": 0.13748766481876373,
"learning_rate": 0.00039055940109122535,
"loss": 1.7374,
"step": 7720
},
{
"epoch": 0.2,
"grad_norm": 0.14492934942245483,
"learning_rate": 0.0003905351226215011,
"loss": 1.732,
"step": 7730
},
{
"epoch": 0.2,
"grad_norm": 0.1277233511209488,
"learning_rate": 0.0003905108137298575,
"loss": 1.685,
"step": 7740
},
{
"epoch": 0.2,
"grad_norm": 0.13751693069934845,
"learning_rate": 0.00039048647442017605,
"loss": 1.7166,
"step": 7750
},
{
"epoch": 0.2,
"grad_norm": 0.2509312927722931,
"learning_rate": 0.00039046210469634274,
"loss": 2.0125,
"step": 7760
},
{
"epoch": 0.2,
"grad_norm": 0.14716836810112,
"learning_rate": 0.00039043770456224876,
"loss": 1.864,
"step": 7770
},
{
"epoch": 0.2,
"grad_norm": 0.13424529135227203,
"learning_rate": 0.00039041327402178984,
"loss": 1.72,
"step": 7780
},
{
"epoch": 0.2,
"grad_norm": 0.12159378826618195,
"learning_rate": 0.00039038881307886674,
"loss": 1.7137,
"step": 7790
},
{
"epoch": 0.2,
"grad_norm": 0.12906280159950256,
"learning_rate": 0.00039036432173738503,
"loss": 1.7289,
"step": 7800
},
{
"epoch": 0.2,
"grad_norm": 0.1405516117811203,
"learning_rate": 0.00039033980000125515,
"loss": 1.7146,
"step": 7810
},
{
"epoch": 0.2,
"grad_norm": 0.1447688192129135,
"learning_rate": 0.00039031524787439236,
"loss": 1.7115,
"step": 7820
},
{
"epoch": 0.2,
"grad_norm": 0.13648471236228943,
"learning_rate": 0.00039029066536071683,
"loss": 1.7023,
"step": 7830
},
{
"epoch": 0.2,
"grad_norm": 0.1418386846780777,
"learning_rate": 0.0003902660524641534,
"loss": 1.7176,
"step": 7840
},
{
"epoch": 0.2,
"grad_norm": 0.12786869704723358,
"learning_rate": 0.00039024140918863214,
"loss": 1.6776,
"step": 7850
},
{
"epoch": 0.2,
"grad_norm": 0.12492866069078445,
"learning_rate": 0.00039021673553808756,
"loss": 1.661,
"step": 7860
},
{
"epoch": 0.2,
"grad_norm": 0.12730900943279266,
"learning_rate": 0.0003901920315164592,
"loss": 1.7203,
"step": 7870
},
{
"epoch": 0.2,
"grad_norm": 0.13647036254405975,
"learning_rate": 0.00039016729712769156,
"loss": 1.7012,
"step": 7880
},
{
"epoch": 0.2,
"grad_norm": 0.14049872756004333,
"learning_rate": 0.0003901425323757337,
"loss": 1.6655,
"step": 7890
},
{
"epoch": 0.2,
"grad_norm": 0.11917974799871445,
"learning_rate": 0.00039011773726453994,
"loss": 1.6759,
"step": 7900
},
{
"epoch": 0.2,
"grad_norm": 0.13427309691905975,
"learning_rate": 0.000390092911798069,
"loss": 1.6681,
"step": 7910
},
{
"epoch": 0.2,
"grad_norm": 0.13851284980773926,
"learning_rate": 0.00039006805598028473,
"loss": 1.7163,
"step": 7920
},
{
"epoch": 0.2,
"grad_norm": 0.1267043501138687,
"learning_rate": 0.0003900431698151557,
"loss": 1.7098,
"step": 7930
},
{
"epoch": 0.2,
"grad_norm": 0.22532662749290466,
"learning_rate": 0.0003900182533066555,
"loss": 1.6815,
"step": 7940
},
{
"epoch": 0.2,
"grad_norm": 0.14643071591854095,
"learning_rate": 0.00038999330645876233,
"loss": 1.7363,
"step": 7950
},
{
"epoch": 0.2,
"grad_norm": 0.15773718059062958,
"learning_rate": 0.0003899683292754594,
"loss": 1.7213,
"step": 7960
},
{
"epoch": 0.2,
"grad_norm": 0.1535635143518448,
"learning_rate": 0.00038994332176073466,
"loss": 1.6854,
"step": 7970
},
{
"epoch": 0.2,
"grad_norm": 0.12380396574735641,
"learning_rate": 0.00038991828391858103,
"loss": 1.6882,
"step": 7980
},
{
"epoch": 0.2,
"grad_norm": 0.13407306373119354,
"learning_rate": 0.00038989321575299613,
"loss": 1.7648,
"step": 7990
},
{
"epoch": 0.2,
"grad_norm": 0.13147538900375366,
"learning_rate": 0.00038986811726798246,
"loss": 1.7537,
"step": 8000
},
{
"epoch": 0.2,
"grad_norm": 0.13497044146060944,
"learning_rate": 0.00038984298846754745,
"loss": 1.6978,
"step": 8010
},
{
"epoch": 0.2,
"grad_norm": 0.13441677391529083,
"learning_rate": 0.0003898178293557033,
"loss": 1.6786,
"step": 8020
},
{
"epoch": 0.2,
"grad_norm": 0.13331085443496704,
"learning_rate": 0.000389792639936467,
"loss": 1.6941,
"step": 8030
},
{
"epoch": 0.2,
"grad_norm": 0.1468220353126526,
"learning_rate": 0.0003897674202138605,
"loss": 1.6199,
"step": 8040
},
{
"epoch": 0.2,
"grad_norm": 0.1379866600036621,
"learning_rate": 0.00038974217019191053,
"loss": 1.7195,
"step": 8050
},
{
"epoch": 0.21,
"grad_norm": 0.13483931124210358,
"learning_rate": 0.0003897168898746486,
"loss": 1.6968,
"step": 8060
},
{
"epoch": 0.21,
"grad_norm": 0.13262777030467987,
"learning_rate": 0.0003896915792661111,
"loss": 1.6964,
"step": 8070
},
{
"epoch": 0.21,
"grad_norm": 0.12962651252746582,
"learning_rate": 0.00038966623837033936,
"loss": 1.7032,
"step": 8080
},
{
"epoch": 0.21,
"grad_norm": 0.13349364697933197,
"learning_rate": 0.0003896408671913793,
"loss": 1.6942,
"step": 8090
},
{
"epoch": 0.21,
"grad_norm": 0.12977685034275055,
"learning_rate": 0.000389615465733282,
"loss": 1.7319,
"step": 8100
},
{
"epoch": 0.21,
"grad_norm": 0.1472356915473938,
"learning_rate": 0.0003895900340001031,
"loss": 1.6924,
"step": 8110
},
{
"epoch": 0.21,
"grad_norm": 0.1957908570766449,
"learning_rate": 0.0003895645719959032,
"loss": 1.6861,
"step": 8120
},
{
"epoch": 0.21,
"grad_norm": 0.14227932691574097,
"learning_rate": 0.00038953907972474764,
"loss": 1.7058,
"step": 8130
},
{
"epoch": 0.21,
"grad_norm": 0.12813404202461243,
"learning_rate": 0.00038951355719070674,
"loss": 1.6742,
"step": 8140
},
{
"epoch": 0.21,
"grad_norm": 0.1380937248468399,
"learning_rate": 0.00038948800439785557,
"loss": 1.7324,
"step": 8150
},
{
"epoch": 0.21,
"grad_norm": 0.13374464213848114,
"learning_rate": 0.00038946242135027404,
"loss": 1.6414,
"step": 8160
},
{
"epoch": 0.21,
"grad_norm": 0.13308840990066528,
"learning_rate": 0.0003894368080520468,
"loss": 1.6614,
"step": 8170
},
{
"epoch": 0.21,
"grad_norm": 0.13562729954719543,
"learning_rate": 0.00038941116450726354,
"loss": 1.6515,
"step": 8180
},
{
"epoch": 0.21,
"grad_norm": 0.13918358087539673,
"learning_rate": 0.0003893854907200185,
"loss": 1.6598,
"step": 8190
},
{
"epoch": 0.21,
"grad_norm": 0.15010669827461243,
"learning_rate": 0.00038935978669441104,
"loss": 1.7217,
"step": 8200
},
{
"epoch": 0.21,
"grad_norm": 0.1475742906332016,
"learning_rate": 0.0003893340524345452,
"loss": 1.6392,
"step": 8210
},
{
"epoch": 0.21,
"grad_norm": 0.1437012404203415,
"learning_rate": 0.00038930828794452976,
"loss": 1.7013,
"step": 8220
},
{
"epoch": 0.21,
"grad_norm": 0.14719869196414948,
"learning_rate": 0.00038928249322847853,
"loss": 1.7308,
"step": 8230
},
{
"epoch": 0.21,
"grad_norm": 0.16744408011436462,
"learning_rate": 0.00038925666829051,
"loss": 1.7185,
"step": 8240
},
{
"epoch": 0.21,
"grad_norm": 0.1367211937904358,
"learning_rate": 0.0003892308131347475,
"loss": 1.7045,
"step": 8250
},
{
"epoch": 0.21,
"grad_norm": 0.13218046724796295,
"learning_rate": 0.00038920492776531925,
"loss": 1.6705,
"step": 8260
},
{
"epoch": 0.21,
"grad_norm": 0.13375000655651093,
"learning_rate": 0.0003891790121863582,
"loss": 1.6399,
"step": 8270
},
{
"epoch": 0.21,
"grad_norm": 0.13972359895706177,
"learning_rate": 0.00038915306640200216,
"loss": 1.6861,
"step": 8280
},
{
"epoch": 0.21,
"grad_norm": 0.13885724544525146,
"learning_rate": 0.00038912709041639395,
"loss": 1.6812,
"step": 8290
},
{
"epoch": 0.21,
"grad_norm": 0.14410728216171265,
"learning_rate": 0.0003891010842336809,
"loss": 1.7016,
"step": 8300
},
{
"epoch": 0.21,
"grad_norm": 0.13372208178043365,
"learning_rate": 0.0003890750478580153,
"loss": 1.6539,
"step": 8310
},
{
"epoch": 0.21,
"grad_norm": 0.133956640958786,
"learning_rate": 0.00038904898129355435,
"loss": 1.6481,
"step": 8320
},
{
"epoch": 0.21,
"grad_norm": 0.14632944762706757,
"learning_rate": 0.00038902288454445997,
"loss": 1.6756,
"step": 8330
},
{
"epoch": 0.21,
"grad_norm": 0.1364748626947403,
"learning_rate": 0.0003889967576148988,
"loss": 1.6624,
"step": 8340
},
{
"epoch": 0.21,
"grad_norm": 0.13148286938667297,
"learning_rate": 0.0003889706005090425,
"loss": 1.6931,
"step": 8350
},
{
"epoch": 0.21,
"grad_norm": 0.14017271995544434,
"learning_rate": 0.0003889444132310675,
"loss": 1.7128,
"step": 8360
},
{
"epoch": 0.21,
"grad_norm": 0.1335548460483551,
"learning_rate": 0.00038891819578515494,
"loss": 1.7638,
"step": 8370
},
{
"epoch": 0.21,
"grad_norm": 0.13582909107208252,
"learning_rate": 0.00038889194817549085,
"loss": 1.6896,
"step": 8380
},
{
"epoch": 0.21,
"grad_norm": 0.1565767079591751,
"learning_rate": 0.00038886567040626616,
"loss": 1.7231,
"step": 8390
},
{
"epoch": 0.21,
"grad_norm": 0.15570221841335297,
"learning_rate": 0.0003888393624816764,
"loss": 1.6965,
"step": 8400
},
{
"epoch": 0.21,
"grad_norm": 0.14114753901958466,
"learning_rate": 0.0003888130244059221,
"loss": 1.6967,
"step": 8410
},
{
"epoch": 0.21,
"grad_norm": 0.13201847672462463,
"learning_rate": 0.00038878665618320864,
"loss": 1.6502,
"step": 8420
},
{
"epoch": 0.21,
"grad_norm": 0.13043740391731262,
"learning_rate": 0.00038876025781774603,
"loss": 1.7026,
"step": 8430
},
{
"epoch": 0.21,
"grad_norm": 0.16153882443904877,
"learning_rate": 0.0003887338293137491,
"loss": 1.6899,
"step": 8440
},
{
"epoch": 0.21,
"grad_norm": 0.12798282504081726,
"learning_rate": 0.0003887073706754377,
"loss": 1.6431,
"step": 8450
},
{
"epoch": 0.22,
"grad_norm": 0.13147814571857452,
"learning_rate": 0.0003886808819070363,
"loss": 1.6663,
"step": 8460
},
{
"epoch": 0.22,
"grad_norm": 0.13304992020130157,
"learning_rate": 0.0003886543630127743,
"loss": 1.6965,
"step": 8470
},
{
"epoch": 0.22,
"grad_norm": 0.13692429661750793,
"learning_rate": 0.00038862781399688585,
"loss": 1.6628,
"step": 8480
},
{
"epoch": 0.22,
"grad_norm": 0.22576695680618286,
"learning_rate": 0.00038860123486360987,
"loss": 1.6373,
"step": 8490
},
{
"epoch": 0.22,
"grad_norm": 0.15108773112297058,
"learning_rate": 0.0003885746256171902,
"loss": 1.655,
"step": 8500
},
{
"epoch": 0.22,
"grad_norm": 0.1294201761484146,
"learning_rate": 0.0003885479862618754,
"loss": 1.6525,
"step": 8510
},
{
"epoch": 0.22,
"grad_norm": 0.12410447001457214,
"learning_rate": 0.00038852131680191875,
"loss": 1.6522,
"step": 8520
},
{
"epoch": 0.22,
"grad_norm": 0.1444026529788971,
"learning_rate": 0.0003884946172415786,
"loss": 1.6696,
"step": 8530
},
{
"epoch": 0.22,
"grad_norm": 0.134349063038826,
"learning_rate": 0.00038846788758511785,
"loss": 1.6533,
"step": 8540
},
{
"epoch": 0.22,
"grad_norm": 0.12954683601856232,
"learning_rate": 0.00038844112783680446,
"loss": 1.6767,
"step": 8550
},
{
"epoch": 0.22,
"grad_norm": 0.13951031863689423,
"learning_rate": 0.00038841433800091085,
"loss": 1.7252,
"step": 8560
},
{
"epoch": 0.22,
"grad_norm": 0.14629290997982025,
"learning_rate": 0.0003883875180817146,
"loss": 1.6573,
"step": 8570
},
{
"epoch": 0.22,
"grad_norm": 0.15033113956451416,
"learning_rate": 0.00038836066808349784,
"loss": 1.7045,
"step": 8580
},
{
"epoch": 0.22,
"grad_norm": 0.13196027278900146,
"learning_rate": 0.00038833378801054753,
"loss": 1.7253,
"step": 8590
},
{
"epoch": 0.22,
"grad_norm": 0.1399390995502472,
"learning_rate": 0.00038830687786715564,
"loss": 1.6248,
"step": 8600
},
{
"epoch": 0.22,
"grad_norm": 0.12518121302127838,
"learning_rate": 0.00038827993765761876,
"loss": 1.6735,
"step": 8610
},
{
"epoch": 0.22,
"grad_norm": 0.13088391721248627,
"learning_rate": 0.00038825296738623824,
"loss": 1.6529,
"step": 8620
},
{
"epoch": 0.22,
"grad_norm": 0.1249965950846672,
"learning_rate": 0.00038822596705732036,
"loss": 1.7086,
"step": 8630
},
{
"epoch": 0.22,
"grad_norm": 0.13543793559074402,
"learning_rate": 0.00038819893667517614,
"loss": 1.6193,
"step": 8640
},
{
"epoch": 0.22,
"grad_norm": 0.16318467259407043,
"learning_rate": 0.00038817187624412146,
"loss": 1.7019,
"step": 8650
},
{
"epoch": 0.22,
"grad_norm": 0.1324823647737503,
"learning_rate": 0.00038814478576847686,
"loss": 1.7179,
"step": 8660
},
{
"epoch": 0.22,
"grad_norm": 0.1390790194272995,
"learning_rate": 0.0003881176652525677,
"loss": 1.6679,
"step": 8670
},
{
"epoch": 0.22,
"grad_norm": 0.12248550355434418,
"learning_rate": 0.00038809051470072443,
"loss": 1.6866,
"step": 8680
},
{
"epoch": 0.22,
"grad_norm": 0.14330117404460907,
"learning_rate": 0.0003880633341172818,
"loss": 1.6583,
"step": 8690
},
{
"epoch": 0.22,
"grad_norm": 0.12497661262750626,
"learning_rate": 0.00038803612350657977,
"loss": 1.6371,
"step": 8700
},
{
"epoch": 0.22,
"grad_norm": 0.14796240627765656,
"learning_rate": 0.0003880088828729629,
"loss": 1.669,
"step": 8710
},
{
"epoch": 0.22,
"grad_norm": 0.198830246925354,
"learning_rate": 0.00038798161222078055,
"loss": 1.6904,
"step": 8720
},
{
"epoch": 0.22,
"grad_norm": 0.1437385082244873,
"learning_rate": 0.00038795431155438703,
"loss": 1.6763,
"step": 8730
},
{
"epoch": 0.22,
"grad_norm": 0.1363937258720398,
"learning_rate": 0.00038792698087814116,
"loss": 1.6551,
"step": 8740
},
{
"epoch": 0.22,
"grad_norm": 0.13170404732227325,
"learning_rate": 0.0003878996201964068,
"loss": 1.6615,
"step": 8750
},
{
"epoch": 0.22,
"grad_norm": 0.12872740626335144,
"learning_rate": 0.00038787222951355247,
"loss": 1.6544,
"step": 8760
},
{
"epoch": 0.22,
"grad_norm": 0.15284587442874908,
"learning_rate": 0.0003878448088339516,
"loss": 1.6436,
"step": 8770
},
{
"epoch": 0.22,
"grad_norm": 0.14942800998687744,
"learning_rate": 0.0003878173581619822,
"loss": 1.6953,
"step": 8780
},
{
"epoch": 0.22,
"grad_norm": 0.1378229409456253,
"learning_rate": 0.00038778987750202735,
"loss": 1.7536,
"step": 8790
},
{
"epoch": 0.22,
"grad_norm": 0.13812203705310822,
"learning_rate": 0.0003877623668584747,
"loss": 1.7113,
"step": 8800
},
{
"epoch": 0.22,
"grad_norm": 0.1441105157136917,
"learning_rate": 0.0003877348262357167,
"loss": 1.7004,
"step": 8810
},
{
"epoch": 0.22,
"grad_norm": 0.1562652587890625,
"learning_rate": 0.0003877072556381507,
"loss": 1.7129,
"step": 8820
},
{
"epoch": 0.22,
"grad_norm": 0.13454972207546234,
"learning_rate": 0.0003876796550701788,
"loss": 1.6665,
"step": 8830
},
{
"epoch": 0.22,
"grad_norm": 0.14589405059814453,
"learning_rate": 0.00038765202453620775,
"loss": 1.6646,
"step": 8840
},
{
"epoch": 0.23,
"grad_norm": 0.13198499381542206,
"learning_rate": 0.0003876243640406494,
"loss": 1.6491,
"step": 8850
},
{
"epoch": 0.23,
"grad_norm": 0.16170230507850647,
"learning_rate": 0.00038759667358792,
"loss": 1.6992,
"step": 8860
},
{
"epoch": 0.23,
"grad_norm": 0.12688037753105164,
"learning_rate": 0.00038756895318244083,
"loss": 1.651,
"step": 8870
},
{
"epoch": 0.23,
"grad_norm": 0.1396840214729309,
"learning_rate": 0.0003875412028286379,
"loss": 1.6469,
"step": 8880
},
{
"epoch": 0.23,
"grad_norm": 0.13911014795303345,
"learning_rate": 0.00038751342253094196,
"loss": 1.6992,
"step": 8890
},
{
"epoch": 0.23,
"grad_norm": 0.1247856467962265,
"learning_rate": 0.0003874856122937886,
"loss": 1.6641,
"step": 8900
},
{
"epoch": 0.23,
"grad_norm": 0.1389637291431427,
"learning_rate": 0.0003874577721216182,
"loss": 1.66,
"step": 8910
},
{
"epoch": 0.23,
"grad_norm": 0.11789216846227646,
"learning_rate": 0.0003874299020188757,
"loss": 1.6096,
"step": 8920
},
{
"epoch": 0.23,
"grad_norm": 0.1422717571258545,
"learning_rate": 0.0003874020019900112,
"loss": 1.6885,
"step": 8930
},
{
"epoch": 0.23,
"grad_norm": 0.13758403062820435,
"learning_rate": 0.0003873740720394793,
"loss": 1.6719,
"step": 8940
},
{
"epoch": 0.23,
"grad_norm": 0.1851162314414978,
"learning_rate": 0.00038734611217173945,
"loss": 1.841,
"step": 8950
},
{
"epoch": 0.23,
"grad_norm": 0.16305282711982727,
"learning_rate": 0.00038731812239125587,
"loss": 1.6986,
"step": 8960
},
{
"epoch": 0.23,
"grad_norm": 0.1490446776151657,
"learning_rate": 0.00038729010270249764,
"loss": 1.7077,
"step": 8970
},
{
"epoch": 0.23,
"grad_norm": 0.13517969846725464,
"learning_rate": 0.0003872620531099384,
"loss": 1.6802,
"step": 8980
},
{
"epoch": 0.23,
"grad_norm": 0.13483189046382904,
"learning_rate": 0.00038723397361805685,
"loss": 1.6995,
"step": 8990
},
{
"epoch": 0.23,
"grad_norm": 0.13785387575626373,
"learning_rate": 0.0003872058642313363,
"loss": 1.7246,
"step": 9000
},
{
"epoch": 0.23,
"grad_norm": 0.127422034740448,
"learning_rate": 0.00038717772495426475,
"loss": 1.6759,
"step": 9010
},
{
"epoch": 0.23,
"grad_norm": 0.14029677212238312,
"learning_rate": 0.00038714955579133514,
"loss": 1.691,
"step": 9020
},
{
"epoch": 0.23,
"grad_norm": 0.1330009549856186,
"learning_rate": 0.0003871213567470452,
"loss": 1.6435,
"step": 9030
},
{
"epoch": 0.23,
"grad_norm": 0.12364694476127625,
"learning_rate": 0.0003870931278258972,
"loss": 1.6694,
"step": 9040
},
{
"epoch": 0.23,
"grad_norm": 0.13656748831272125,
"learning_rate": 0.0003870648690323984,
"loss": 1.7141,
"step": 9050
},
{
"epoch": 0.23,
"grad_norm": 0.1401910036802292,
"learning_rate": 0.00038703658037106083,
"loss": 1.6992,
"step": 9060
},
{
"epoch": 0.23,
"grad_norm": 0.13535091280937195,
"learning_rate": 0.0003870082618464011,
"loss": 1.6243,
"step": 9070
},
{
"epoch": 0.23,
"grad_norm": 0.13311734795570374,
"learning_rate": 0.0003869799134629408,
"loss": 1.6122,
"step": 9080
},
{
"epoch": 0.23,
"grad_norm": 0.12910255789756775,
"learning_rate": 0.0003869515352252061,
"loss": 1.7029,
"step": 9090
},
{
"epoch": 0.23,
"grad_norm": 0.12881791591644287,
"learning_rate": 0.0003869231271377281,
"loss": 1.6627,
"step": 9100
},
{
"epoch": 0.23,
"grad_norm": 0.12921388447284698,
"learning_rate": 0.0003868946892050426,
"loss": 1.6297,
"step": 9110
},
{
"epoch": 0.23,
"grad_norm": 0.12428681552410126,
"learning_rate": 0.0003868662214316901,
"loss": 1.6419,
"step": 9120
},
{
"epoch": 0.23,
"grad_norm": 0.15118862688541412,
"learning_rate": 0.000386837723822216,
"loss": 1.7649,
"step": 9130
},
{
"epoch": 0.23,
"grad_norm": 0.1526978313922882,
"learning_rate": 0.00038680919638117033,
"loss": 1.6654,
"step": 9140
},
{
"epoch": 0.23,
"grad_norm": 0.15959547460079193,
"learning_rate": 0.00038678063911310796,
"loss": 1.6896,
"step": 9150
},
{
"epoch": 0.23,
"grad_norm": 0.14070424437522888,
"learning_rate": 0.0003867520520225886,
"loss": 1.6588,
"step": 9160
},
{
"epoch": 0.23,
"grad_norm": 0.15812811255455017,
"learning_rate": 0.00038672343511417646,
"loss": 1.6973,
"step": 9170
},
{
"epoch": 0.23,
"grad_norm": 0.1548108160495758,
"learning_rate": 0.0003866947883924408,
"loss": 1.6617,
"step": 9180
},
{
"epoch": 0.23,
"grad_norm": 0.1341230422258377,
"learning_rate": 0.0003866661118619554,
"loss": 1.7306,
"step": 9190
},
{
"epoch": 0.23,
"grad_norm": 0.13387803733348846,
"learning_rate": 0.0003866374055272991,
"loss": 1.6905,
"step": 9200
},
{
"epoch": 0.23,
"grad_norm": 0.17781074345111847,
"learning_rate": 0.0003866086693930552,
"loss": 1.6869,
"step": 9210
},
{
"epoch": 0.23,
"grad_norm": 0.13148914277553558,
"learning_rate": 0.0003865799034638118,
"loss": 1.6822,
"step": 9220
},
{
"epoch": 0.23,
"grad_norm": 0.12276733666658401,
"learning_rate": 0.00038655110774416196,
"loss": 1.7114,
"step": 9230
},
{
"epoch": 0.24,
"grad_norm": 0.1315593272447586,
"learning_rate": 0.00038652228223870337,
"loss": 1.683,
"step": 9240
},
{
"epoch": 0.24,
"grad_norm": 0.13443982601165771,
"learning_rate": 0.0003864934269520384,
"loss": 1.6736,
"step": 9250
},
{
"epoch": 0.24,
"grad_norm": 0.132383331656456,
"learning_rate": 0.0003864645418887743,
"loss": 1.6831,
"step": 9260
},
{
"epoch": 0.24,
"grad_norm": 0.12144575268030167,
"learning_rate": 0.00038643562705352295,
"loss": 1.6697,
"step": 9270
},
{
"epoch": 0.24,
"grad_norm": 0.14845208823680878,
"learning_rate": 0.0003864066824509012,
"loss": 1.6565,
"step": 9280
},
{
"epoch": 0.24,
"grad_norm": 0.13534559309482574,
"learning_rate": 0.0003863777080855303,
"loss": 1.6132,
"step": 9290
},
{
"epoch": 0.24,
"grad_norm": 0.12392674386501312,
"learning_rate": 0.0003863487039620366,
"loss": 1.7073,
"step": 9300
},
{
"epoch": 0.24,
"grad_norm": 0.12857109308242798,
"learning_rate": 0.0003863196700850511,
"loss": 1.6741,
"step": 9310
},
{
"epoch": 0.24,
"grad_norm": 0.13830944895744324,
"learning_rate": 0.0003862906064592094,
"loss": 1.6617,
"step": 9320
},
{
"epoch": 0.24,
"grad_norm": 0.13779057562351227,
"learning_rate": 0.0003862615130891521,
"loss": 1.6576,
"step": 9330
},
{
"epoch": 0.24,
"grad_norm": 0.12795256078243256,
"learning_rate": 0.0003862323899795243,
"loss": 1.6418,
"step": 9340
},
{
"epoch": 0.24,
"grad_norm": 0.12998293340206146,
"learning_rate": 0.000386203237134976,
"loss": 1.645,
"step": 9350
},
{
"epoch": 0.24,
"grad_norm": 0.13535144925117493,
"learning_rate": 0.00038617405456016187,
"loss": 1.7063,
"step": 9360
},
{
"epoch": 0.24,
"grad_norm": 0.14552073180675507,
"learning_rate": 0.00038614484225974144,
"loss": 1.7002,
"step": 9370
},
{
"epoch": 0.24,
"grad_norm": 0.13565002381801605,
"learning_rate": 0.00038611560023837883,
"loss": 1.6609,
"step": 9380
},
{
"epoch": 0.24,
"grad_norm": 0.13093070685863495,
"learning_rate": 0.00038608632850074305,
"loss": 1.6438,
"step": 9390
},
{
"epoch": 0.24,
"grad_norm": 0.15767642855644226,
"learning_rate": 0.0003860570270515077,
"loss": 1.7212,
"step": 9400
},
{
"epoch": 0.24,
"grad_norm": 0.12853416800498962,
"learning_rate": 0.0003860276958953514,
"loss": 1.6376,
"step": 9410
},
{
"epoch": 0.24,
"grad_norm": 0.15520334243774414,
"learning_rate": 0.00038599833503695713,
"loss": 1.7403,
"step": 9420
},
{
"epoch": 0.24,
"grad_norm": 0.13134929537773132,
"learning_rate": 0.00038596894448101297,
"loss": 1.6265,
"step": 9430
},
{
"epoch": 0.24,
"grad_norm": 0.12647289037704468,
"learning_rate": 0.0003859395242322115,
"loss": 1.6685,
"step": 9440
},
{
"epoch": 0.24,
"grad_norm": 0.1194876879453659,
"learning_rate": 0.00038591007429525,
"loss": 1.629,
"step": 9450
},
{
"epoch": 0.24,
"grad_norm": 0.12292616069316864,
"learning_rate": 0.0003858805946748309,
"loss": 1.6883,
"step": 9460
},
{
"epoch": 0.24,
"grad_norm": 0.1364501565694809,
"learning_rate": 0.00038585108537566085,
"loss": 1.6677,
"step": 9470
},
{
"epoch": 0.24,
"grad_norm": 0.1332101970911026,
"learning_rate": 0.00038582154640245156,
"loss": 1.6828,
"step": 9480
},
{
"epoch": 0.24,
"grad_norm": 0.13498814404010773,
"learning_rate": 0.0003857919777599194,
"loss": 1.6912,
"step": 9490
},
{
"epoch": 0.24,
"grad_norm": 0.1320127248764038,
"learning_rate": 0.00038576237945278543,
"loss": 1.6613,
"step": 9500
},
{
"epoch": 0.24,
"grad_norm": 0.12627199292182922,
"learning_rate": 0.0003857327514857755,
"loss": 1.6771,
"step": 9510
},
{
"epoch": 0.24,
"grad_norm": 0.12172795087099075,
"learning_rate": 0.00038570309386362015,
"loss": 1.6874,
"step": 9520
},
{
"epoch": 0.24,
"grad_norm": 0.14356915652751923,
"learning_rate": 0.00038567340659105483,
"loss": 1.7213,
"step": 9530
},
{
"epoch": 0.24,
"grad_norm": 0.14241376519203186,
"learning_rate": 0.00038564368967281936,
"loss": 1.7341,
"step": 9540
},
{
"epoch": 0.24,
"grad_norm": 0.15648552775382996,
"learning_rate": 0.00038561394311365866,
"loss": 1.7055,
"step": 9550
},
{
"epoch": 0.24,
"grad_norm": 0.1445528119802475,
"learning_rate": 0.00038558416691832217,
"loss": 1.6674,
"step": 9560
},
{
"epoch": 0.24,
"grad_norm": 0.13047567009925842,
"learning_rate": 0.0003855543610915642,
"loss": 1.6434,
"step": 9570
},
{
"epoch": 0.24,
"grad_norm": 0.137510746717453,
"learning_rate": 0.0003855245256381436,
"loss": 1.6924,
"step": 9580
},
{
"epoch": 0.24,
"grad_norm": 0.12807559967041016,
"learning_rate": 0.00038549466056282417,
"loss": 1.6805,
"step": 9590
},
{
"epoch": 0.24,
"grad_norm": 0.13783133029937744,
"learning_rate": 0.0003854647658703743,
"loss": 1.6968,
"step": 9600
},
{
"epoch": 0.24,
"grad_norm": 0.13350753486156464,
"learning_rate": 0.0003854348415655671,
"loss": 1.6599,
"step": 9610
},
{
"epoch": 0.24,
"grad_norm": 0.15386106073856354,
"learning_rate": 0.0003854048876531805,
"loss": 1.6706,
"step": 9620
},
{
"epoch": 0.24,
"grad_norm": 0.14253292977809906,
"learning_rate": 0.0003853749041379972,
"loss": 1.6567,
"step": 9630
},
{
"epoch": 0.25,
"grad_norm": 0.1428574025630951,
"learning_rate": 0.00038534489102480437,
"loss": 1.6911,
"step": 9640
},
{
"epoch": 0.25,
"grad_norm": 0.12866978347301483,
"learning_rate": 0.0003853148483183942,
"loss": 1.6676,
"step": 9650
},
{
"epoch": 0.25,
"grad_norm": 0.12328783422708511,
"learning_rate": 0.00038528477602356345,
"loss": 1.639,
"step": 9660
},
{
"epoch": 0.25,
"grad_norm": 0.12650468945503235,
"learning_rate": 0.00038525467414511356,
"loss": 1.6606,
"step": 9670
},
{
"epoch": 0.25,
"grad_norm": 0.13679854571819305,
"learning_rate": 0.0003852245426878508,
"loss": 1.6582,
"step": 9680
},
{
"epoch": 0.25,
"grad_norm": 0.13160301744937897,
"learning_rate": 0.0003851943816565862,
"loss": 1.6697,
"step": 9690
},
{
"epoch": 0.25,
"grad_norm": 0.13433463871479034,
"learning_rate": 0.00038516419105613544,
"loss": 1.5862,
"step": 9700
},
{
"epoch": 0.25,
"grad_norm": 0.12435674667358398,
"learning_rate": 0.0003851339708913188,
"loss": 1.6992,
"step": 9710
},
{
"epoch": 0.25,
"grad_norm": 0.1282668113708496,
"learning_rate": 0.0003851037211669615,
"loss": 1.6415,
"step": 9720
},
{
"epoch": 0.25,
"grad_norm": 0.13300156593322754,
"learning_rate": 0.00038507344188789335,
"loss": 1.6636,
"step": 9730
},
{
"epoch": 0.25,
"grad_norm": 0.13623046875,
"learning_rate": 0.00038504313305894896,
"loss": 1.6138,
"step": 9740
},
{
"epoch": 0.25,
"grad_norm": 0.1425723135471344,
"learning_rate": 0.0003850127946849676,
"loss": 1.6753,
"step": 9750
},
{
"epoch": 0.25,
"grad_norm": 0.1205907091498375,
"learning_rate": 0.0003849824267707932,
"loss": 1.6981,
"step": 9760
},
{
"epoch": 0.25,
"grad_norm": 0.13399383425712585,
"learning_rate": 0.0003849520293212745,
"loss": 1.6687,
"step": 9770
},
{
"epoch": 0.25,
"grad_norm": 0.14110614359378815,
"learning_rate": 0.000384921602341265,
"loss": 1.6718,
"step": 9780
},
{
"epoch": 0.25,
"grad_norm": 0.14635838568210602,
"learning_rate": 0.0003848911458356227,
"loss": 1.6424,
"step": 9790
},
{
"epoch": 0.25,
"grad_norm": 0.12122263759374619,
"learning_rate": 0.0003848606598092106,
"loss": 1.6415,
"step": 9800
},
{
"epoch": 0.25,
"grad_norm": 0.13565990328788757,
"learning_rate": 0.00038483014426689624,
"loss": 1.6716,
"step": 9810
},
{
"epoch": 0.25,
"grad_norm": 0.1413806825876236,
"learning_rate": 0.0003847995992135519,
"loss": 1.6768,
"step": 9820
},
{
"epoch": 0.25,
"grad_norm": 0.14945995807647705,
"learning_rate": 0.0003847690246540545,
"loss": 1.6907,
"step": 9830
},
{
"epoch": 0.25,
"grad_norm": 0.1399151086807251,
"learning_rate": 0.00038473842059328587,
"loss": 1.7095,
"step": 9840
},
{
"epoch": 0.25,
"grad_norm": 0.1341298669576645,
"learning_rate": 0.00038470778703613226,
"loss": 1.6107,
"step": 9850
},
{
"epoch": 0.25,
"grad_norm": 0.1368744671344757,
"learning_rate": 0.00038467712398748504,
"loss": 1.6942,
"step": 9860
},
{
"epoch": 0.25,
"grad_norm": 0.12841053307056427,
"learning_rate": 0.00038464643145223976,
"loss": 1.6767,
"step": 9870
},
{
"epoch": 0.25,
"grad_norm": 0.12721240520477295,
"learning_rate": 0.0003846157094352973,
"loss": 1.6485,
"step": 9880
},
{
"epoch": 0.25,
"grad_norm": 0.1607305407524109,
"learning_rate": 0.0003845849579415626,
"loss": 1.6569,
"step": 9890
},
{
"epoch": 0.25,
"grad_norm": 0.12862354516983032,
"learning_rate": 0.00038455417697594573,
"loss": 1.6933,
"step": 9900
},
{
"epoch": 0.25,
"grad_norm": 0.12964557111263275,
"learning_rate": 0.0003845233665433614,
"loss": 1.6559,
"step": 9910
},
{
"epoch": 0.25,
"grad_norm": 0.13759677112102509,
"learning_rate": 0.00038449252664872897,
"loss": 1.6401,
"step": 9920
},
{
"epoch": 0.25,
"grad_norm": 0.13009068369865417,
"learning_rate": 0.00038446165729697244,
"loss": 1.7112,
"step": 9930
},
{
"epoch": 0.25,
"grad_norm": 0.12266763299703598,
"learning_rate": 0.0003844307584930207,
"loss": 1.6665,
"step": 9940
},
{
"epoch": 0.25,
"grad_norm": 0.16423115134239197,
"learning_rate": 0.0003843998302418071,
"loss": 1.6358,
"step": 9950
},
{
"epoch": 0.25,
"grad_norm": 0.1375732421875,
"learning_rate": 0.0003843688725482699,
"loss": 1.643,
"step": 9960
},
{
"epoch": 0.25,
"grad_norm": 0.1389283388853073,
"learning_rate": 0.000384337885417352,
"loss": 1.7376,
"step": 9970
},
{
"epoch": 0.25,
"grad_norm": 0.14045849442481995,
"learning_rate": 0.00038430686885400096,
"loss": 1.6542,
"step": 9980
},
{
"epoch": 0.25,
"grad_norm": 0.1282881647348404,
"learning_rate": 0.000384275822863169,
"loss": 1.6977,
"step": 9990
},
{
"epoch": 0.25,
"grad_norm": 0.14488451182842255,
"learning_rate": 0.00038424474744981315,
"loss": 1.6779,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 78622,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"total_flos": 8.458132783104e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}