{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.877450980392156,
"eval_steps": 500,
"global_step": 408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 4.875,
"learning_rate": 2e-05,
"loss": 7.5441,
"step": 1
},
{
"epoch": 0.04,
"grad_norm": 4.8125,
"learning_rate": 4e-05,
"loss": 7.8502,
"step": 2
},
{
"epoch": 0.06,
"grad_norm": 5.0,
"learning_rate": 6e-05,
"loss": 7.9553,
"step": 3
},
{
"epoch": 0.08,
"grad_norm": 5.03125,
"learning_rate": 8e-05,
"loss": 7.4329,
"step": 4
},
{
"epoch": 0.1,
"grad_norm": 5.6875,
"learning_rate": 0.0001,
"loss": 6.7046,
"step": 5
},
{
"epoch": 0.12,
"grad_norm": 2.984375,
"learning_rate": 0.00012,
"loss": 6.1039,
"step": 6
},
{
"epoch": 0.14,
"grad_norm": 2.03125,
"learning_rate": 0.00014,
"loss": 5.6611,
"step": 7
},
{
"epoch": 0.16,
"grad_norm": 1.734375,
"learning_rate": 0.00016,
"loss": 5.0838,
"step": 8
},
{
"epoch": 0.18,
"grad_norm": 2.265625,
"learning_rate": 0.00018,
"loss": 4.8258,
"step": 9
},
{
"epoch": 0.2,
"grad_norm": 2.671875,
"learning_rate": 0.0002,
"loss": 4.4867,
"step": 10
},
{
"epoch": 0.22,
"grad_norm": 1.921875,
"learning_rate": 0.00019999688468941564,
"loss": 4.0076,
"step": 11
},
{
"epoch": 0.24,
"grad_norm": 1.46875,
"learning_rate": 0.00019998753895176575,
"loss": 3.8401,
"step": 12
},
{
"epoch": 0.25,
"grad_norm": 1.203125,
"learning_rate": 0.0001999719633693478,
"loss": 3.8209,
"step": 13
},
{
"epoch": 0.27,
"grad_norm": 0.796875,
"learning_rate": 0.0001999501589126174,
"loss": 3.7106,
"step": 14
},
{
"epoch": 0.29,
"grad_norm": 1.2265625,
"learning_rate": 0.00019992212694012757,
"loss": 3.7207,
"step": 15
},
{
"epoch": 0.31,
"grad_norm": 0.59765625,
"learning_rate": 0.00019988786919844436,
"loss": 3.3611,
"step": 16
},
{
"epoch": 0.33,
"grad_norm": 0.66796875,
"learning_rate": 0.0001998473878220379,
"loss": 3.4981,
"step": 17
},
{
"epoch": 0.35,
"grad_norm": 0.67578125,
"learning_rate": 0.00019980068533314934,
"loss": 3.5695,
"step": 18
},
{
"epoch": 0.37,
"grad_norm": 0.4453125,
"learning_rate": 0.00019974776464163387,
"loss": 3.5129,
"step": 19
},
{
"epoch": 0.39,
"grad_norm": 0.337890625,
"learning_rate": 0.00019968862904477935,
"loss": 3.4233,
"step": 20
},
{
"epoch": 0.41,
"grad_norm": 0.4609375,
"learning_rate": 0.0001996232822271007,
"loss": 3.4322,
"step": 21
},
{
"epoch": 0.43,
"grad_norm": 0.353515625,
"learning_rate": 0.00019955172826011062,
"loss": 3.335,
"step": 22
},
{
"epoch": 0.45,
"grad_norm": 0.30078125,
"learning_rate": 0.0001994739716020657,
"loss": 3.5036,
"step": 23
},
{
"epoch": 0.47,
"grad_norm": 0.375,
"learning_rate": 0.0001993900170976888,
"loss": 3.3962,
"step": 24
},
{
"epoch": 0.49,
"grad_norm": 0.4609375,
"learning_rate": 0.00019929986997786699,
"loss": 3.3721,
"step": 25
},
{
"epoch": 0.51,
"grad_norm": 0.30078125,
"learning_rate": 0.00019920353585932578,
"loss": 3.4649,
"step": 26
},
{
"epoch": 0.53,
"grad_norm": 0.2060546875,
"learning_rate": 0.0001991010207442792,
"loss": 3.2541,
"step": 27
},
{
"epoch": 0.55,
"grad_norm": 0.5078125,
"learning_rate": 0.00019899233102005573,
"loss": 3.3746,
"step": 28
},
{
"epoch": 0.57,
"grad_norm": 0.349609375,
"learning_rate": 0.00019887747345870028,
"loss": 3.3363,
"step": 29
},
{
"epoch": 0.59,
"grad_norm": 0.2421875,
"learning_rate": 0.0001987564552165524,
"loss": 3.3452,
"step": 30
},
{
"epoch": 0.61,
"grad_norm": 0.40625,
"learning_rate": 0.0001986292838338003,
"loss": 3.3879,
"step": 31
},
{
"epoch": 0.63,
"grad_norm": 0.263671875,
"learning_rate": 0.00019849596723401107,
"loss": 3.2731,
"step": 32
},
{
"epoch": 0.65,
"grad_norm": 0.2080078125,
"learning_rate": 0.000198356513723637,
"loss": 3.3342,
"step": 33
},
{
"epoch": 0.67,
"grad_norm": 0.3671875,
"learning_rate": 0.00019821093199149804,
"loss": 3.3667,
"step": 34
},
{
"epoch": 0.69,
"grad_norm": 0.3046875,
"learning_rate": 0.0001980592311082404,
"loss": 3.3652,
"step": 35
},
{
"epoch": 0.71,
"grad_norm": 0.2060546875,
"learning_rate": 0.0001979014205257715,
"loss": 3.2905,
"step": 36
},
{
"epoch": 0.73,
"grad_norm": 0.33203125,
"learning_rate": 0.00019773751007667073,
"loss": 3.3441,
"step": 37
},
{
"epoch": 0.75,
"grad_norm": 0.3671875,
"learning_rate": 0.0001975675099735774,
"loss": 3.3104,
"step": 38
},
{
"epoch": 0.76,
"grad_norm": 0.20703125,
"learning_rate": 0.00019739143080855378,
"loss": 3.3328,
"step": 39
},
{
"epoch": 0.78,
"grad_norm": 0.3125,
"learning_rate": 0.00019720928355242568,
"loss": 3.2689,
"step": 40
},
{
"epoch": 0.8,
"grad_norm": 0.236328125,
"learning_rate": 0.00019702107955409863,
"loss": 3.2674,
"step": 41
},
{
"epoch": 0.82,
"grad_norm": 0.2001953125,
"learning_rate": 0.00019682683053985072,
"loss": 3.246,
"step": 42
},
{
"epoch": 0.84,
"grad_norm": 0.302734375,
"learning_rate": 0.0001966265486126022,
"loss": 3.2299,
"step": 43
},
{
"epoch": 0.86,
"grad_norm": 0.25,
"learning_rate": 0.00019642024625116117,
"loss": 3.347,
"step": 44
},
{
"epoch": 0.88,
"grad_norm": 0.212890625,
"learning_rate": 0.0001962079363094463,
"loss": 3.2526,
"step": 45
},
{
"epoch": 0.9,
"grad_norm": 0.2275390625,
"learning_rate": 0.00019598963201568573,
"loss": 3.1977,
"step": 46
},
{
"epoch": 0.92,
"grad_norm": 0.228515625,
"learning_rate": 0.00019576534697159296,
"loss": 3.3306,
"step": 47
},
{
"epoch": 0.94,
"grad_norm": 0.2041015625,
"learning_rate": 0.0001955350951515195,
"loss": 3.2819,
"step": 48
},
{
"epoch": 0.96,
"grad_norm": 0.3046875,
"learning_rate": 0.00019529889090158392,
"loss": 3.2065,
"step": 49
},
{
"epoch": 0.98,
"grad_norm": 0.2490234375,
"learning_rate": 0.0001950567489387783,
"loss": 3.2446,
"step": 50
},
{
"epoch": 1.0,
"grad_norm": 0.2890625,
"learning_rate": 0.00019480868435005095,
"loss": 3.1567,
"step": 51
},
{
"epoch": 1.02,
"grad_norm": 0.3359375,
"learning_rate": 0.0001945547125913667,
"loss": 3.2422,
"step": 52
},
{
"epoch": 1.01,
"grad_norm": 0.251953125,
"learning_rate": 0.00019429484948674372,
"loss": 3.2118,
"step": 53
},
{
"epoch": 1.03,
"grad_norm": 0.373046875,
"learning_rate": 0.00019402911122726757,
"loss": 3.1756,
"step": 54
},
{
"epoch": 1.05,
"grad_norm": 0.291015625,
"learning_rate": 0.00019375751437008252,
"loss": 3.1951,
"step": 55
},
{
"epoch": 1.07,
"grad_norm": 0.291015625,
"learning_rate": 0.00019348007583735983,
"loss": 3.1604,
"step": 56
},
{
"epoch": 1.09,
"grad_norm": 0.259765625,
"learning_rate": 0.0001931968129152435,
"loss": 3.1231,
"step": 57
},
{
"epoch": 1.11,
"grad_norm": 0.259765625,
"learning_rate": 0.00019290774325277305,
"loss": 3.1371,
"step": 58
},
{
"epoch": 1.13,
"grad_norm": 0.37109375,
"learning_rate": 0.00019261288486078414,
"loss": 3.2218,
"step": 59
},
{
"epoch": 1.15,
"grad_norm": 0.251953125,
"learning_rate": 0.0001923122561107861,
"loss": 3.178,
"step": 60
},
{
"epoch": 1.17,
"grad_norm": 0.283203125,
"learning_rate": 0.00019200587573381744,
"loss": 3.1543,
"step": 61
},
{
"epoch": 1.19,
"grad_norm": 0.267578125,
"learning_rate": 0.00019169376281927888,
"loss": 3.1088,
"step": 62
},
{
"epoch": 1.21,
"grad_norm": 0.244140625,
"learning_rate": 0.0001913759368137437,
"loss": 3.1465,
"step": 63
},
{
"epoch": 1.23,
"grad_norm": 0.353515625,
"learning_rate": 0.00019105241751974622,
"loss": 3.1484,
"step": 64
},
{
"epoch": 1.25,
"grad_norm": 0.294921875,
"learning_rate": 0.00019072322509454815,
"loss": 3.127,
"step": 65
},
{
"epoch": 1.26,
"grad_norm": 0.2265625,
"learning_rate": 0.0001903883800488824,
"loss": 3.123,
"step": 66
},
{
"epoch": 1.28,
"grad_norm": 0.271484375,
"learning_rate": 0.00019004790324567519,
"loss": 3.1802,
"step": 67
},
{
"epoch": 1.3,
"grad_norm": 0.23828125,
"learning_rate": 0.00018970181589874637,
"loss": 3.0633,
"step": 68
},
{
"epoch": 1.32,
"grad_norm": 0.3203125,
"learning_rate": 0.00018935013957148742,
"loss": 3.1452,
"step": 69
},
{
"epoch": 1.34,
"grad_norm": 0.265625,
"learning_rate": 0.00018899289617551804,
"loss": 3.0458,
"step": 70
},
{
"epoch": 1.36,
"grad_norm": 0.287109375,
"learning_rate": 0.0001886301079693209,
"loss": 3.1749,
"step": 71
},
{
"epoch": 1.38,
"grad_norm": 0.25390625,
"learning_rate": 0.0001882617975568547,
"loss": 3.1142,
"step": 72
},
{
"epoch": 1.4,
"grad_norm": 0.2734375,
"learning_rate": 0.00018788798788614607,
"loss": 3.0497,
"step": 73
},
{
"epoch": 1.42,
"grad_norm": 0.28515625,
"learning_rate": 0.00018750870224785939,
"loss": 3.1338,
"step": 74
},
{
"epoch": 1.44,
"grad_norm": 0.267578125,
"learning_rate": 0.00018712396427384594,
"loss": 3.1228,
"step": 75
},
{
"epoch": 1.46,
"grad_norm": 0.341796875,
"learning_rate": 0.00018673379793567146,
"loss": 3.1073,
"step": 76
},
{
"epoch": 1.48,
"grad_norm": 0.248046875,
"learning_rate": 0.00018633822754312234,
"loss": 3.0681,
"step": 77
},
{
"epoch": 1.5,
"grad_norm": 0.4140625,
"learning_rate": 0.0001859372777426912,
"loss": 3.0805,
"step": 78
},
{
"epoch": 1.52,
"grad_norm": 0.259765625,
"learning_rate": 0.00018553097351604118,
"loss": 3.0641,
"step": 79
},
{
"epoch": 1.54,
"grad_norm": 0.453125,
"learning_rate": 0.00018511934017844948,
"loss": 3.011,
"step": 80
},
{
"epoch": 1.56,
"grad_norm": 0.28515625,
"learning_rate": 0.00018470240337722991,
"loss": 3.0556,
"step": 81
},
{
"epoch": 1.58,
"grad_norm": 0.37890625,
"learning_rate": 0.00018428018909013506,
"loss": 3.1694,
"step": 82
},
{
"epoch": 1.6,
"grad_norm": 0.263671875,
"learning_rate": 0.00018385272362373775,
"loss": 3.1034,
"step": 83
},
{
"epoch": 1.62,
"grad_norm": 0.341796875,
"learning_rate": 0.00018342003361179176,
"loss": 3.0954,
"step": 84
},
{
"epoch": 1.64,
"grad_norm": 0.287109375,
"learning_rate": 0.0001829821460135726,
"loss": 3.0729,
"step": 85
},
{
"epoch": 1.66,
"grad_norm": 0.259765625,
"learning_rate": 0.00018253908811219764,
"loss": 3.0459,
"step": 86
},
{
"epoch": 1.68,
"grad_norm": 0.384765625,
"learning_rate": 0.00018209088751292626,
"loss": 3.0627,
"step": 87
},
{
"epoch": 1.7,
"grad_norm": 0.236328125,
"learning_rate": 0.00018163757214143992,
"loss": 3.0817,
"step": 88
},
{
"epoch": 1.72,
"grad_norm": 0.40234375,
"learning_rate": 0.00018117917024210208,
"loss": 3.0809,
"step": 89
},
{
"epoch": 1.74,
"grad_norm": 0.3984375,
"learning_rate": 0.00018071571037619853,
"loss": 3.0377,
"step": 90
},
{
"epoch": 1.75,
"grad_norm": 0.625,
"learning_rate": 0.00018024722142015781,
"loss": 3.0733,
"step": 91
},
{
"epoch": 1.77,
"grad_norm": 0.41796875,
"learning_rate": 0.00017977373256375194,
"loss": 3.0641,
"step": 92
},
{
"epoch": 1.79,
"grad_norm": 0.62109375,
"learning_rate": 0.00017929527330827786,
"loss": 3.1211,
"step": 93
},
{
"epoch": 1.81,
"grad_norm": 0.55078125,
"learning_rate": 0.00017881187346471925,
"loss": 3.1069,
"step": 94
},
{
"epoch": 1.83,
"grad_norm": 0.60546875,
"learning_rate": 0.00017832356315188906,
"loss": 2.9687,
"step": 95
},
{
"epoch": 1.85,
"grad_norm": 0.609375,
"learning_rate": 0.00017783037279455298,
"loss": 3.1262,
"step": 96
},
{
"epoch": 1.87,
"grad_norm": 0.275390625,
"learning_rate": 0.00017733233312153393,
"loss": 3.0263,
"step": 97
},
{
"epoch": 1.89,
"grad_norm": 0.72265625,
"learning_rate": 0.00017682947516379707,
"loss": 3.0806,
"step": 98
},
{
"epoch": 1.91,
"grad_norm": 0.359375,
"learning_rate": 0.00017632183025251686,
"loss": 3.0914,
"step": 99
},
{
"epoch": 1.93,
"grad_norm": 0.50390625,
"learning_rate": 0.00017580943001712455,
"loss": 3.0555,
"step": 100
},
{
"epoch": 1.95,
"grad_norm": 0.5859375,
"learning_rate": 0.00017529230638333772,
"loss": 2.9838,
"step": 101
},
{
"epoch": 1.97,
"grad_norm": 0.318359375,
"learning_rate": 0.00017477049157117093,
"loss": 2.9647,
"step": 102
},
{
"epoch": 1.99,
"grad_norm": 0.55078125,
"learning_rate": 0.00017424401809292833,
"loss": 3.03,
"step": 103
},
{
"epoch": 2.01,
"grad_norm": 0.380859375,
"learning_rate": 0.0001737129187511779,
"loss": 3.0758,
"step": 104
},
{
"epoch": 2.01,
"grad_norm": 0.447265625,
"learning_rate": 0.0001731772266367077,
"loss": 3.035,
"step": 105
},
{
"epoch": 2.03,
"grad_norm": 0.396484375,
"learning_rate": 0.00017263697512646394,
"loss": 3.0058,
"step": 106
},
{
"epoch": 2.05,
"grad_norm": 0.28515625,
"learning_rate": 0.00017209219788147167,
"loss": 2.9659,
"step": 107
},
{
"epoch": 2.07,
"grad_norm": 0.462890625,
"learning_rate": 0.00017154292884473713,
"loss": 2.8561,
"step": 108
},
{
"epoch": 2.09,
"grad_norm": 0.3359375,
"learning_rate": 0.0001709892022391333,
"loss": 2.9509,
"step": 109
},
{
"epoch": 2.11,
"grad_norm": 0.365234375,
"learning_rate": 0.00017043105256526724,
"loss": 2.998,
"step": 110
},
{
"epoch": 2.13,
"grad_norm": 0.416015625,
"learning_rate": 0.00016986851459933067,
"loss": 2.9808,
"step": 111
},
{
"epoch": 2.15,
"grad_norm": 0.275390625,
"learning_rate": 0.00016930162339093318,
"loss": 2.9386,
"step": 112
},
{
"epoch": 2.17,
"grad_norm": 0.369140625,
"learning_rate": 0.00016873041426091845,
"loss": 2.9435,
"step": 113
},
{
"epoch": 2.19,
"grad_norm": 0.365234375,
"learning_rate": 0.0001681549227991634,
"loss": 2.9056,
"step": 114
},
{
"epoch": 2.21,
"grad_norm": 0.3125,
"learning_rate": 0.00016757518486236087,
"loss": 2.9288,
"step": 115
},
{
"epoch": 2.23,
"grad_norm": 0.328125,
"learning_rate": 0.00016699123657178553,
"loss": 2.9522,
"step": 116
},
{
"epoch": 2.25,
"grad_norm": 0.326171875,
"learning_rate": 0.0001664031143110431,
"loss": 2.838,
"step": 117
},
{
"epoch": 2.27,
"grad_norm": 0.34375,
"learning_rate": 0.00016581085472380376,
"loss": 2.9151,
"step": 118
},
{
"epoch": 2.29,
"grad_norm": 0.39453125,
"learning_rate": 0.00016521449471151867,
"loss": 2.9665,
"step": 119
},
{
"epoch": 2.31,
"grad_norm": 0.3359375,
"learning_rate": 0.00016461407143112097,
"loss": 2.9064,
"step": 120
},
{
"epoch": 2.33,
"grad_norm": 0.33203125,
"learning_rate": 0.00016400962229271072,
"loss": 2.8611,
"step": 121
},
{
"epoch": 2.35,
"grad_norm": 0.3125,
"learning_rate": 0.00016340118495722388,
"loss": 2.9698,
"step": 122
},
{
"epoch": 2.37,
"grad_norm": 0.3203125,
"learning_rate": 0.00016278879733408585,
"loss": 2.8706,
"step": 123
},
{
"epoch": 2.39,
"grad_norm": 0.3046875,
"learning_rate": 0.00016217249757884955,
"loss": 3.0017,
"step": 124
},
{
"epoch": 2.41,
"grad_norm": 0.39453125,
"learning_rate": 0.00016155232409081793,
"loss": 2.9088,
"step": 125
},
{
"epoch": 2.43,
"grad_norm": 0.337890625,
"learning_rate": 0.0001609283155106517,
"loss": 2.8959,
"step": 126
},
{
"epoch": 2.45,
"grad_norm": 0.373046875,
"learning_rate": 0.00016030051071796146,
"loss": 2.909,
"step": 127
},
{
"epoch": 2.47,
"grad_norm": 0.39453125,
"learning_rate": 0.00015966894882888562,
"loss": 2.9291,
"step": 128
},
{
"epoch": 2.49,
"grad_norm": 0.3203125,
"learning_rate": 0.00015903366919365282,
"loss": 2.9638,
"step": 129
},
{
"epoch": 2.5,
"grad_norm": 0.380859375,
"learning_rate": 0.00015839471139413066,
"loss": 2.9725,
"step": 130
},
{
"epoch": 2.52,
"grad_norm": 0.34765625,
"learning_rate": 0.0001577521152413589,
"loss": 3.0265,
"step": 131
},
{
"epoch": 2.54,
"grad_norm": 0.310546875,
"learning_rate": 0.0001571059207730695,
"loss": 2.8662,
"step": 132
},
{
"epoch": 2.56,
"grad_norm": 0.3203125,
"learning_rate": 0.0001564561682511918,
"loss": 2.9864,
"step": 133
},
{
"epoch": 2.58,
"grad_norm": 0.349609375,
"learning_rate": 0.00015580289815934401,
"loss": 2.9896,
"step": 134
},
{
"epoch": 2.6,
"grad_norm": 0.39453125,
"learning_rate": 0.00015514615120031076,
"loss": 2.9797,
"step": 135
},
{
"epoch": 2.62,
"grad_norm": 0.318359375,
"learning_rate": 0.00015448596829350706,
"loss": 2.9245,
"step": 136
},
{
"epoch": 2.64,
"grad_norm": 0.431640625,
"learning_rate": 0.00015382239057242888,
"loss": 2.9472,
"step": 137
},
{
"epoch": 2.66,
"grad_norm": 0.326171875,
"learning_rate": 0.00015315545938209015,
"loss": 3.0054,
"step": 138
},
{
"epoch": 2.68,
"grad_norm": 0.33984375,
"learning_rate": 0.00015248521627644684,
"loss": 3.002,
"step": 139
},
{
"epoch": 2.7,
"grad_norm": 0.41015625,
"learning_rate": 0.00015181170301580777,
"loss": 2.9129,
"step": 140
},
{
"epoch": 2.72,
"grad_norm": 0.328125,
"learning_rate": 0.0001511349615642327,
"loss": 2.9397,
"step": 141
},
{
"epoch": 2.74,
"grad_norm": 0.470703125,
"learning_rate": 0.00015045503408691775,
"loss": 2.9214,
"step": 142
},
{
"epoch": 2.76,
"grad_norm": 0.349609375,
"learning_rate": 0.00014977196294756832,
"loss": 2.9527,
"step": 143
},
{
"epoch": 2.78,
"grad_norm": 0.375,
"learning_rate": 0.00014908579070575936,
"loss": 2.9482,
"step": 144
},
{
"epoch": 2.8,
"grad_norm": 0.3515625,
"learning_rate": 0.00014839656011428389,
"loss": 3.0022,
"step": 145
},
{
"epoch": 2.82,
"grad_norm": 0.32421875,
"learning_rate": 0.00014770431411648897,
"loss": 2.9553,
"step": 146
},
{
"epoch": 2.84,
"grad_norm": 0.4296875,
"learning_rate": 0.0001470090958436003,
"loss": 2.9322,
"step": 147
},
{
"epoch": 2.86,
"grad_norm": 0.34375,
"learning_rate": 0.0001463109486120348,
"loss": 2.9479,
"step": 148
},
{
"epoch": 2.88,
"grad_norm": 0.404296875,
"learning_rate": 0.00014560991592070158,
"loss": 2.8904,
"step": 149
},
{
"epoch": 2.9,
"grad_norm": 0.33203125,
"learning_rate": 0.00014490604144829202,
"loss": 2.9523,
"step": 150
},
{
"epoch": 2.92,
"grad_norm": 0.3828125,
"learning_rate": 0.00014419936905055793,
"loss": 2.9384,
"step": 151
},
{
"epoch": 2.94,
"grad_norm": 0.376953125,
"learning_rate": 0.00014348994275757931,
"loss": 2.9068,
"step": 152
},
{
"epoch": 2.96,
"grad_norm": 0.373046875,
"learning_rate": 0.00014277780677102097,
"loss": 2.9824,
"step": 153
},
{
"epoch": 2.98,
"grad_norm": 0.34375,
"learning_rate": 0.00014206300546137842,
"loss": 2.9305,
"step": 154
},
{
"epoch": 3.0,
"grad_norm": 0.322265625,
"learning_rate": 0.00014134558336521342,
"loss": 2.9497,
"step": 155
},
{
"epoch": 3.01,
"grad_norm": 0.35546875,
"learning_rate": 0.00014062558518237892,
"loss": 2.9443,
"step": 156
},
{
"epoch": 3.02,
"grad_norm": 0.34765625,
"learning_rate": 0.0001399030557732341,
"loss": 2.7934,
"step": 157
},
{
"epoch": 3.04,
"grad_norm": 0.345703125,
"learning_rate": 0.00013917804015584932,
"loss": 2.8409,
"step": 158
},
{
"epoch": 3.06,
"grad_norm": 0.359375,
"learning_rate": 0.00013845058350320108,
"loss": 2.7915,
"step": 159
},
{
"epoch": 3.08,
"grad_norm": 0.396484375,
"learning_rate": 0.00013772073114035762,
"loss": 2.8044,
"step": 160
},
{
"epoch": 3.1,
"grad_norm": 0.3828125,
"learning_rate": 0.0001369885285416547,
"loss": 2.7569,
"step": 161
},
{
"epoch": 3.12,
"grad_norm": 0.390625,
"learning_rate": 0.00013625402132786248,
"loss": 2.7233,
"step": 162
},
{
"epoch": 3.14,
"grad_norm": 0.404296875,
"learning_rate": 0.00013551725526334284,
"loss": 2.8137,
"step": 163
},
{
"epoch": 3.16,
"grad_norm": 0.3984375,
"learning_rate": 0.00013477827625319824,
"loss": 2.7915,
"step": 164
},
{
"epoch": 3.18,
"grad_norm": 0.419921875,
"learning_rate": 0.0001340371303404113,
"loss": 2.8062,
"step": 165
},
{
"epoch": 3.2,
"grad_norm": 0.41015625,
"learning_rate": 0.00013329386370297615,
"loss": 2.7944,
"step": 166
},
{
"epoch": 3.22,
"grad_norm": 0.5078125,
"learning_rate": 0.00013254852265102117,
"loss": 2.832,
"step": 167
},
{
"epoch": 3.24,
"grad_norm": 0.458984375,
"learning_rate": 0.00013180115362392382,
"loss": 2.802,
"step": 168
},
{
"epoch": 3.25,
"grad_norm": 0.44921875,
"learning_rate": 0.0001310518031874169,
"loss": 2.7101,
"step": 169
},
{
"epoch": 3.27,
"grad_norm": 0.515625,
"learning_rate": 0.00013030051803068727,
"loss": 2.8507,
"step": 170
},
{
"epoch": 3.29,
"grad_norm": 0.404296875,
"learning_rate": 0.00012954734496346704,
"loss": 2.8022,
"step": 171
},
{
"epoch": 3.31,
"grad_norm": 0.50390625,
"learning_rate": 0.00012879233091311667,
"loss": 2.8003,
"step": 172
},
{
"epoch": 3.33,
"grad_norm": 0.498046875,
"learning_rate": 0.00012803552292170144,
"loss": 2.8,
"step": 173
},
{
"epoch": 3.35,
"grad_norm": 0.51171875,
"learning_rate": 0.00012727696814306033,
"loss": 2.9019,
"step": 174
},
{
"epoch": 3.37,
"grad_norm": 0.49609375,
"learning_rate": 0.00012651671383986788,
"loss": 2.815,
"step": 175
},
{
"epoch": 3.39,
"grad_norm": 0.4453125,
"learning_rate": 0.0001257548073806897,
"loss": 2.8604,
"step": 176
},
{
"epoch": 3.41,
"grad_norm": 0.60546875,
"learning_rate": 0.00012499129623703086,
"loss": 2.7868,
"step": 177
},
{
"epoch": 3.43,
"grad_norm": 0.412109375,
"learning_rate": 0.00012422622798037832,
"loss": 2.8025,
"step": 178
},
{
"epoch": 3.45,
"grad_norm": 0.73828125,
"learning_rate": 0.0001234596502792369,
"loss": 2.7852,
"step": 179
},
{
"epoch": 3.47,
"grad_norm": 0.427734375,
"learning_rate": 0.000122691610896159,
"loss": 2.7422,
"step": 180
},
{
"epoch": 3.49,
"grad_norm": 0.44921875,
"learning_rate": 0.00012192215768476916,
"loss": 2.7908,
"step": 181
},
{
"epoch": 3.51,
"grad_norm": 0.6171875,
"learning_rate": 0.00012115133858678191,
"loss": 2.8865,
"step": 182
},
{
"epoch": 3.53,
"grad_norm": 0.458984375,
"learning_rate": 0.00012037920162901521,
"loss": 2.8135,
"step": 183
},
{
"epoch": 3.55,
"grad_norm": 0.546875,
"learning_rate": 0.00011960579492039783,
"loss": 2.8735,
"step": 184
},
{
"epoch": 3.57,
"grad_norm": 0.412109375,
"learning_rate": 0.00011883116664897178,
"loss": 2.7555,
"step": 185
},
{
"epoch": 3.59,
"grad_norm": 0.484375,
"learning_rate": 0.00011805536507889021,
"loss": 2.7936,
"step": 186
},
{
"epoch": 3.61,
"grad_norm": 0.55078125,
"learning_rate": 0.00011727843854740996,
"loss": 2.8138,
"step": 187
},
{
"epoch": 3.63,
"grad_norm": 0.50390625,
"learning_rate": 0.00011650043546187995,
"loss": 2.8566,
"step": 188
},
{
"epoch": 3.65,
"grad_norm": 0.48046875,
"learning_rate": 0.00011572140429672508,
"loss": 2.8295,
"step": 189
},
{
"epoch": 3.67,
"grad_norm": 0.46484375,
"learning_rate": 0.0001149413935904261,
"loss": 2.8278,
"step": 190
},
{
"epoch": 3.69,
"grad_norm": 0.486328125,
"learning_rate": 0.00011416045194249516,
"loss": 2.7748,
"step": 191
},
{
"epoch": 3.71,
"grad_norm": 0.5234375,
"learning_rate": 0.00011337862801044792,
"loss": 2.838,
"step": 192
},
{
"epoch": 3.73,
"grad_norm": 0.435546875,
"learning_rate": 0.00011259597050677178,
"loss": 2.7914,
"step": 193
},
{
"epoch": 3.75,
"grad_norm": 0.494140625,
"learning_rate": 0.00011181252819589081,
"loss": 2.7811,
"step": 194
},
{
"epoch": 3.76,
"grad_norm": 0.404296875,
"learning_rate": 0.00011102834989112751,
"loss": 2.8006,
"step": 195
},
{
"epoch": 3.78,
"grad_norm": 0.423828125,
"learning_rate": 0.00011024348445166133,
"loss": 2.8152,
"step": 196
},
{
"epoch": 3.8,
"grad_norm": 0.53515625,
"learning_rate": 0.0001094579807794845,
"loss": 2.8045,
"step": 197
},
{
"epoch": 3.82,
"grad_norm": 0.431640625,
"learning_rate": 0.00010867188781635512,
"loss": 2.8115,
"step": 198
},
{
"epoch": 3.84,
"grad_norm": 0.546875,
"learning_rate": 0.00010788525454074765,
"loss": 2.8645,
"step": 199
},
{
"epoch": 3.86,
"grad_norm": 0.474609375,
"learning_rate": 0.0001070981299648016,
"loss": 2.8005,
"step": 200
},
{
"epoch": 3.88,
"grad_norm": 0.4375,
"learning_rate": 0.00010631056313126734,
"loss": 2.8381,
"step": 201
},
{
"epoch": 3.9,
"grad_norm": 0.486328125,
"learning_rate": 0.00010552260311045082,
"loss": 2.8413,
"step": 202
},
{
"epoch": 3.92,
"grad_norm": 0.44140625,
"learning_rate": 0.00010473429899715581,
"loss": 2.8571,
"step": 203
},
{
"epoch": 3.94,
"grad_norm": 0.3984375,
"learning_rate": 0.00010394569990762529,
"loss": 2.8747,
"step": 204
},
{
"epoch": 3.96,
"grad_norm": 0.478515625,
"learning_rate": 0.00010315685497648106,
"loss": 2.8742,
"step": 205
},
{
"epoch": 3.98,
"grad_norm": 0.419921875,
"learning_rate": 0.00010236781335366239,
"loss": 2.8503,
"step": 206
},
{
"epoch": 4.0,
"grad_norm": 0.41796875,
"learning_rate": 0.0001015786242013637,
"loss": 2.8812,
"step": 207
},
{
"epoch": 4.02,
"grad_norm": 0.5703125,
"learning_rate": 0.00010078933669097135,
"loss": 2.8629,
"step": 208
},
{
"epoch": 4.02,
"grad_norm": 0.43359375,
"learning_rate": 0.0001,
"loss": 2.7461,
"step": 209
},
{
"epoch": 4.04,
"grad_norm": 0.53515625,
"learning_rate": 9.92106633090287e-05,
"loss": 2.6341,
"step": 210
},
{
"epoch": 4.06,
"grad_norm": 0.58203125,
"learning_rate": 9.842137579863632e-05,
"loss": 2.7251,
"step": 211
},
{
"epoch": 4.08,
"grad_norm": 0.7421875,
"learning_rate": 9.763218664633763e-05,
"loss": 2.6282,
"step": 212
},
{
"epoch": 4.1,
"grad_norm": 0.58984375,
"learning_rate": 9.684314502351894e-05,
"loss": 2.6809,
"step": 213
},
{
"epoch": 4.12,
"grad_norm": 0.578125,
"learning_rate": 9.605430009237474e-05,
"loss": 2.6245,
"step": 214
},
{
"epoch": 4.14,
"grad_norm": 0.75390625,
"learning_rate": 9.526570100284422e-05,
"loss": 2.6561,
"step": 215
},
{
"epoch": 4.16,
"grad_norm": 0.5078125,
"learning_rate": 9.447739688954919e-05,
"loss": 2.7119,
"step": 216
},
{
"epoch": 4.18,
"grad_norm": 0.5546875,
"learning_rate": 9.368943686873267e-05,
"loss": 2.7143,
"step": 217
},
{
"epoch": 4.2,
"grad_norm": 0.4921875,
"learning_rate": 9.29018700351984e-05,
"loss": 2.7154,
"step": 218
},
{
"epoch": 4.22,
"grad_norm": 0.56640625,
"learning_rate": 9.211474545925236e-05,
"loss": 2.7656,
"step": 219
},
{
"epoch": 4.24,
"grad_norm": 0.515625,
"learning_rate": 9.132811218364495e-05,
"loss": 2.6825,
"step": 220
},
{
"epoch": 4.25,
"grad_norm": 0.5234375,
"learning_rate": 9.054201922051552e-05,
"loss": 2.7077,
"step": 221
},
{
"epoch": 4.27,
"grad_norm": 0.494140625,
"learning_rate": 8.975651554833869e-05,
"loss": 2.6552,
"step": 222
},
{
"epoch": 4.29,
"grad_norm": 0.51953125,
"learning_rate": 8.89716501088725e-05,
"loss": 2.6438,
"step": 223
},
{
"epoch": 4.31,
"grad_norm": 0.498046875,
"learning_rate": 8.818747180410921e-05,
"loss": 2.685,
"step": 224
},
{
"epoch": 4.33,
"grad_norm": 0.546875,
"learning_rate": 8.740402949322827e-05,
"loss": 2.6834,
"step": 225
},
{
"epoch": 4.35,
"grad_norm": 0.53125,
"learning_rate": 8.66213719895521e-05,
"loss": 2.6793,
"step": 226
},
{
"epoch": 4.37,
"grad_norm": 0.51171875,
"learning_rate": 8.583954805750487e-05,
"loss": 2.6911,
"step": 227
},
{
"epoch": 4.39,
"grad_norm": 0.5390625,
"learning_rate": 8.505860640957391e-05,
"loss": 2.6713,
"step": 228
},
{
"epoch": 4.41,
"grad_norm": 0.515625,
"learning_rate": 8.427859570327494e-05,
"loss": 2.6732,
"step": 229
},
{
"epoch": 4.43,
"grad_norm": 0.5,
"learning_rate": 8.349956453812009e-05,
"loss": 2.7043,
"step": 230
},
{
"epoch": 4.45,
"grad_norm": 0.490234375,
"learning_rate": 8.272156145259006e-05,
"loss": 2.6899,
"step": 231
},
{
"epoch": 4.47,
"grad_norm": 0.5,
"learning_rate": 8.194463492110981e-05,
"loss": 2.6345,
"step": 232
},
{
"epoch": 4.49,
"grad_norm": 0.486328125,
"learning_rate": 8.11688333510282e-05,
"loss": 2.7625,
"step": 233
},
{
"epoch": 4.51,
"grad_norm": 0.4765625,
"learning_rate": 8.03942050796022e-05,
"loss": 2.676,
"step": 234
},
{
"epoch": 4.53,
"grad_norm": 0.53125,
"learning_rate": 7.96207983709848e-05,
"loss": 2.6027,
"step": 235
},
{
"epoch": 4.55,
"grad_norm": 0.486328125,
"learning_rate": 7.88486614132181e-05,
"loss": 2.6139,
"step": 236
},
{
"epoch": 4.57,
"grad_norm": 0.51953125,
"learning_rate": 7.807784231523089e-05,
"loss": 2.7281,
"step": 237
},
{
"epoch": 4.59,
"grad_norm": 0.578125,
"learning_rate": 7.730838910384097e-05,
"loss": 2.7225,
"step": 238
},
{
"epoch": 4.61,
"grad_norm": 0.5234375,
"learning_rate": 7.654034972076314e-05,
"loss": 2.6199,
"step": 239
},
{
"epoch": 4.63,
"grad_norm": 0.5,
"learning_rate": 7.57737720196217e-05,
"loss": 2.721,
"step": 240
},
{
"epoch": 4.65,
"grad_norm": 0.515625,
"learning_rate": 7.500870376296918e-05,
"loss": 2.6753,
"step": 241
},
{
"epoch": 4.67,
"grad_norm": 0.515625,
"learning_rate": 7.424519261931036e-05,
"loss": 2.6821,
"step": 242
},
{
"epoch": 4.69,
"grad_norm": 0.4921875,
"learning_rate": 7.348328616013213e-05,
"loss": 2.6376,
"step": 243
},
{
"epoch": 4.71,
"grad_norm": 0.5078125,
"learning_rate": 7.27230318569397e-05,
"loss": 2.8049,
"step": 244
},
{
"epoch": 4.73,
"grad_norm": 0.5546875,
"learning_rate": 7.196447707829857e-05,
"loss": 2.6317,
"step": 245
},
{
"epoch": 4.75,
"grad_norm": 0.5078125,
"learning_rate": 7.120766908688336e-05,
"loss": 2.6795,
"step": 246
},
{
"epoch": 4.76,
"grad_norm": 0.5,
"learning_rate": 7.045265503653303e-05,
"loss": 2.752,
"step": 247
},
{
"epoch": 4.78,
"grad_norm": 0.5234375,
"learning_rate": 6.969948196931272e-05,
"loss": 2.6873,
"step": 248
},
{
"epoch": 4.8,
"grad_norm": 0.50390625,
"learning_rate": 6.894819681258312e-05,
"loss": 2.5891,
"step": 249
},
{
"epoch": 4.82,
"grad_norm": 0.5234375,
"learning_rate": 6.819884637607619e-05,
"loss": 2.6957,
"step": 250
},
{
"epoch": 4.84,
"grad_norm": 0.53515625,
"learning_rate": 6.745147734897883e-05,
"loss": 2.6414,
"step": 251
},
{
"epoch": 4.86,
"grad_norm": 0.51171875,
"learning_rate": 6.670613629702391e-05,
"loss": 2.6997,
"step": 252
},
{
"epoch": 4.88,
"grad_norm": 0.51171875,
"learning_rate": 6.596286965958872e-05,
"loss": 2.6878,
"step": 253
},
{
"epoch": 4.9,
"grad_norm": 0.56640625,
"learning_rate": 6.522172374680177e-05,
"loss": 2.7198,
"step": 254
},
{
"epoch": 4.92,
"grad_norm": 0.5078125,
"learning_rate": 6.448274473665717e-05,
"loss": 2.7487,
"step": 255
},
{
"epoch": 4.94,
"grad_norm": 0.5,
"learning_rate": 6.374597867213756e-05,
"loss": 2.7132,
"step": 256
},
{
"epoch": 4.96,
"grad_norm": 0.546875,
"learning_rate": 6.301147145834534e-05,
"loss": 2.7664,
"step": 257
},
{
"epoch": 4.98,
"grad_norm": 0.49609375,
"learning_rate": 6.22792688596424e-05,
"loss": 2.7209,
"step": 258
},
{
"epoch": 5.0,
"grad_norm": 0.50390625,
"learning_rate": 6.154941649679894e-05,
"loss": 2.7295,
"step": 259
},
{
"epoch": 5.02,
"grad_norm": 0.5,
"learning_rate": 6.0821959844150687e-05,
"loss": 2.7088,
"step": 260
},
{
"epoch": 5.02,
"grad_norm": 0.51953125,
"learning_rate": 6.00969442267659e-05,
"loss": 2.5827,
"step": 261
},
{
"epoch": 5.04,
"grad_norm": 0.5234375,
"learning_rate": 5.9374414817621114e-05,
"loss": 2.5989,
"step": 262
},
{
"epoch": 5.06,
"grad_norm": 0.5703125,
"learning_rate": 5.8654416634786605e-05,
"loss": 2.6026,
"step": 263
},
{
"epoch": 5.08,
"grad_norm": 0.53515625,
"learning_rate": 5.7936994538621605e-05,
"loss": 2.5145,
"step": 264
},
{
"epoch": 5.1,
"grad_norm": 0.51953125,
"learning_rate": 5.7222193228979037e-05,
"loss": 2.464,
"step": 265
},
{
"epoch": 5.12,
"grad_norm": 0.6328125,
"learning_rate": 5.651005724242071e-05,
"loss": 2.5677,
"step": 266
},
{
"epoch": 5.14,
"grad_norm": 0.6640625,
"learning_rate": 5.58006309494421e-05,
"loss": 2.5876,
"step": 267
},
{
"epoch": 5.16,
"grad_norm": 0.609375,
"learning_rate": 5.509395855170798e-05,
"loss": 2.566,
"step": 268
},
{
"epoch": 5.18,
"grad_norm": 0.640625,
"learning_rate": 5.43900840792984e-05,
"loss": 2.5722,
"step": 269
},
{
"epoch": 5.2,
"grad_norm": 0.6015625,
"learning_rate": 5.368905138796523e-05,
"loss": 2.5799,
"step": 270
},
{
"epoch": 5.22,
"grad_norm": 0.578125,
"learning_rate": 5.2990904156399726e-05,
"loss": 2.5271,
"step": 271
},
{
"epoch": 5.24,
"grad_norm": 0.578125,
"learning_rate": 5.229568588351108e-05,
"loss": 2.5608,
"step": 272
},
{
"epoch": 5.25,
"grad_norm": 0.68359375,
"learning_rate": 5.160343988571613e-05,
"loss": 2.4864,
"step": 273
},
{
"epoch": 5.27,
"grad_norm": 0.65625,
"learning_rate": 5.0914209294240644e-05,
"loss": 2.6039,
"step": 274
},
{
"epoch": 5.29,
"grad_norm": 0.6328125,
"learning_rate": 5.022803705243169e-05,
"loss": 2.6246,
"step": 275
},
{
"epoch": 5.31,
"grad_norm": 0.5390625,
"learning_rate": 4.9544965913082264e-05,
"loss": 2.5739,
"step": 276
},
{
"epoch": 5.33,
"grad_norm": 0.65234375,
"learning_rate": 4.886503843576735e-05,
"loss": 2.6554,
"step": 277
},
{
"epoch": 5.35,
"grad_norm": 0.609375,
"learning_rate": 4.818829698419225e-05,
"loss": 2.5539,
"step": 278
},
{
"epoch": 5.37,
"grad_norm": 0.66796875,
"learning_rate": 4.751478372355317e-05,
"loss": 2.512,
"step": 279
},
{
"epoch": 5.39,
"grad_norm": 0.58984375,
"learning_rate": 4.684454061790987e-05,
"loss": 2.5418,
"step": 280
},
{
"epoch": 5.41,
"grad_norm": 0.6171875,
"learning_rate": 4.617760942757117e-05,
"loss": 2.5095,
"step": 281
},
{
"epoch": 5.43,
"grad_norm": 0.65625,
"learning_rate": 4.5514031706492986e-05,
"loss": 2.5265,
"step": 282
},
{
"epoch": 5.45,
"grad_norm": 0.62109375,
"learning_rate": 4.485384879968926e-05,
"loss": 2.5866,
"step": 283
},
{
"epoch": 5.47,
"grad_norm": 0.62890625,
"learning_rate": 4.4197101840655995e-05,
"loss": 2.5831,
"step": 284
},
{
"epoch": 5.49,
"grad_norm": 0.59375,
"learning_rate": 4.354383174880818e-05,
"loss": 2.5747,
"step": 285
},
{
"epoch": 5.51,
"grad_norm": 0.625,
"learning_rate": 4.289407922693053e-05,
"loss": 2.6402,
"step": 286
},
{
"epoch": 5.53,
"grad_norm": 0.61328125,
"learning_rate": 4.224788475864115e-05,
"loss": 2.6473,
"step": 287
},
{
"epoch": 5.55,
"grad_norm": 0.64453125,
"learning_rate": 4.1605288605869365e-05,
"loss": 2.4913,
"step": 288
},
{
"epoch": 5.57,
"grad_norm": 0.6015625,
"learning_rate": 4.0966330806347166e-05,
"loss": 2.5025,
"step": 289
},
{
"epoch": 5.59,
"grad_norm": 0.578125,
"learning_rate": 4.033105117111441e-05,
"loss": 2.5332,
"step": 290
},
{
"epoch": 5.61,
"grad_norm": 0.57421875,
"learning_rate": 3.969948928203856e-05,
"loss": 2.5641,
"step": 291
},
{
"epoch": 5.63,
"grad_norm": 0.609375,
"learning_rate": 3.907168448934836e-05,
"loss": 2.5393,
"step": 292
},
{
"epoch": 5.65,
"grad_norm": 0.625,
"learning_rate": 3.844767590918209e-05,
"loss": 2.5763,
"step": 293
},
{
"epoch": 5.67,
"grad_norm": 0.5546875,
"learning_rate": 3.7827502421150496e-05,
"loss": 2.5888,
"step": 294
},
{
"epoch": 5.69,
"grad_norm": 0.65234375,
"learning_rate": 3.7211202665914155e-05,
"loss": 2.6156,
"step": 295
},
{
"epoch": 5.71,
"grad_norm": 0.5625,
"learning_rate": 3.659881504277613e-05,
"loss": 2.5849,
"step": 296
},
{
"epoch": 5.73,
"grad_norm": 0.6171875,
"learning_rate": 3.599037770728929e-05,
"loss": 2.5871,
"step": 297
},
{
"epoch": 5.75,
"grad_norm": 0.55859375,
"learning_rate": 3.538592856887901e-05,
"loss": 2.662,
"step": 298
},
{
"epoch": 5.76,
"grad_norm": 0.61328125,
"learning_rate": 3.478550528848134e-05,
"loss": 2.6323,
"step": 299
},
{
"epoch": 5.78,
"grad_norm": 0.55078125,
"learning_rate": 3.4189145276196245e-05,
"loss": 2.6352,
"step": 300
},
{
"epoch": 5.8,
"grad_norm": 0.59375,
"learning_rate": 3.359688568895689e-05,
"loss": 2.5697,
"step": 301
},
{
"epoch": 5.82,
"grad_norm": 0.5703125,
"learning_rate": 3.3008763428214505e-05,
"loss": 2.5935,
"step": 302
},
{
"epoch": 5.84,
"grad_norm": 0.62109375,
"learning_rate": 3.242481513763913e-05,
"loss": 2.5634,
"step": 303
},
{
"epoch": 5.86,
"grad_norm": 0.5859375,
"learning_rate": 3.1845077200836636e-05,
"loss": 2.5948,
"step": 304
},
{
"epoch": 5.88,
"grad_norm": 0.5859375,
"learning_rate": 3.126958573908156e-05,
"loss": 2.5673,
"step": 305
},
{
"epoch": 5.9,
"grad_norm": 0.6484375,
"learning_rate": 3.0698376609066825e-05,
"loss": 2.5718,
"step": 306
},
{
"epoch": 5.92,
"grad_norm": 0.609375,
"learning_rate": 3.0131485400669356e-05,
"loss": 2.5712,
"step": 307
},
{
"epoch": 5.94,
"grad_norm": 0.58203125,
"learning_rate": 2.9568947434732775e-05,
"loss": 2.5854,
"step": 308
},
{
"epoch": 5.96,
"grad_norm": 0.6796875,
"learning_rate": 2.9010797760866737e-05,
"loss": 2.5602,
"step": 309
},
{
"epoch": 5.98,
"grad_norm": 0.59765625,
"learning_rate": 2.8457071155262884e-05,
"loss": 2.587,
"step": 310
},
{
"epoch": 6.0,
"grad_norm": 0.61328125,
"learning_rate": 2.7907802118528383e-05,
"loss": 2.608,
"step": 311
},
{
"epoch": 6.0,
"grad_norm": 0.80859375,
"learning_rate": 2.736302487353609e-05,
"loss": 2.5485,
"step": 312
},
{
"epoch": 6.02,
"grad_norm": 0.578125,
"learning_rate": 2.682277336329233e-05,
"loss": 2.498,
"step": 313
},
{
"epoch": 6.04,
"grad_norm": 0.6171875,
"learning_rate": 2.628708124882212e-05,
"loss": 2.5104,
"step": 314
},
{
"epoch": 6.06,
"grad_norm": 0.62109375,
"learning_rate": 2.575598190707168e-05,
"loss": 2.4751,
"step": 315
},
{
"epoch": 6.08,
"grad_norm": 0.578125,
"learning_rate": 2.5229508428829096e-05,
"loss": 2.512,
"step": 316
},
{
"epoch": 6.1,
"grad_norm": 0.61328125,
"learning_rate": 2.4707693616662308e-05,
"loss": 2.4899,
"step": 317
},
{
"epoch": 6.12,
"grad_norm": 0.5859375,
"learning_rate": 2.4190569982875467e-05,
"loss": 2.4877,
"step": 318
},
{
"epoch": 6.14,
"grad_norm": 0.6328125,
"learning_rate": 2.367816974748317e-05,
"loss": 2.482,
"step": 319
},
{
"epoch": 6.16,
"grad_norm": 0.65234375,
"learning_rate": 2.3170524836202933e-05,
"loss": 2.4958,
"step": 320
},
{
"epoch": 6.18,
"grad_norm": 0.62890625,
"learning_rate": 2.266766687846611e-05,
"loss": 2.4936,
"step": 321
},
{
"epoch": 6.2,
"grad_norm": 0.6796875,
"learning_rate": 2.216962720544703e-05,
"loss": 2.5029,
"step": 322
},
{
"epoch": 6.22,
"grad_norm": 0.67578125,
"learning_rate": 2.167643684811096e-05,
"loss": 2.4946,
"step": 323
},
{
"epoch": 6.24,
"grad_norm": 0.6796875,
"learning_rate": 2.1188126535280773e-05,
"loss": 2.4766,
"step": 324
},
{
"epoch": 6.26,
"grad_norm": 0.64453125,
"learning_rate": 2.070472669172213e-05,
"loss": 2.4435,
"step": 325
},
{
"epoch": 6.28,
"grad_norm": 0.6640625,
"learning_rate": 2.022626743624807e-05,
"loss": 2.5658,
"step": 326
},
{
"epoch": 6.3,
"grad_norm": 0.60546875,
"learning_rate": 1.9752778579842213e-05,
"loss": 2.514,
"step": 327
},
{
"epoch": 6.32,
"grad_norm": 0.6328125,
"learning_rate": 1.9284289623801477e-05,
"loss": 2.4731,
"step": 328
},
{
"epoch": 6.34,
"grad_norm": 0.62109375,
"learning_rate": 1.882082975789795e-05,
"loss": 2.4501,
"step": 329
},
{
"epoch": 6.36,
"grad_norm": 0.609375,
"learning_rate": 1.8362427858560093e-05,
"loss": 2.531,
"step": 330
},
{
"epoch": 6.38,
"grad_norm": 0.625,
"learning_rate": 1.7909112487073754e-05,
"loss": 2.5093,
"step": 331
},
{
"epoch": 6.4,
"grad_norm": 0.65625,
"learning_rate": 1.74609118878024e-05,
"loss": 2.4216,
"step": 332
},
{
"epoch": 6.42,
"grad_norm": 0.6171875,
"learning_rate": 1.7017853986427425e-05,
"loss": 2.5315,
"step": 333
},
{
"epoch": 6.44,
"grad_norm": 0.61328125,
"learning_rate": 1.657996638820826e-05,
"loss": 2.5197,
"step": 334
},
{
"epoch": 6.46,
"grad_norm": 0.59765625,
"learning_rate": 1.6147276376262255e-05,
"loss": 2.3946,
"step": 335
},
{
"epoch": 6.48,
"grad_norm": 0.6328125,
"learning_rate": 1.5719810909864942e-05,
"loss": 2.5408,
"step": 336
},
{
"epoch": 6.5,
"grad_norm": 0.609375,
"learning_rate": 1.5297596622770115e-05,
"loss": 2.4942,
"step": 337
},
{
"epoch": 6.51,
"grad_norm": 0.63671875,
"learning_rate": 1.4880659821550546e-05,
"loss": 2.434,
"step": 338
},
{
"epoch": 6.53,
"grad_norm": 0.62890625,
"learning_rate": 1.4469026483958837e-05,
"loss": 2.5376,
"step": 339
},
{
"epoch": 6.55,
"grad_norm": 0.6015625,
"learning_rate": 1.4062722257308803e-05,
"loss": 2.5387,
"step": 340
},
{
"epoch": 6.57,
"grad_norm": 0.61328125,
"learning_rate": 1.3661772456877675e-05,
"loss": 2.4496,
"step": 341
},
{
"epoch": 6.59,
"grad_norm": 0.63671875,
"learning_rate": 1.3266202064328548e-05,
"loss": 2.5007,
"step": 342
},
{
"epoch": 6.61,
"grad_norm": 0.6640625,
"learning_rate": 1.2876035726154045e-05,
"loss": 2.4802,
"step": 343
},
{
"epoch": 6.63,
"grad_norm": 0.6328125,
"learning_rate": 1.2491297752140641e-05,
"loss": 2.5287,
"step": 344
},
{
"epoch": 6.65,
"grad_norm": 0.66015625,
"learning_rate": 1.2112012113853954e-05,
"loss": 2.4877,
"step": 345
},
{
"epoch": 6.67,
"grad_norm": 0.6875,
"learning_rate": 1.1738202443145308e-05,
"loss": 2.4865,
"step": 346
},
{
"epoch": 6.69,
"grad_norm": 0.6953125,
"learning_rate": 1.1369892030679141e-05,
"loss": 2.4497,
"step": 347
},
{
"epoch": 6.71,
"grad_norm": 0.66015625,
"learning_rate": 1.1007103824481979e-05,
"loss": 2.486,
"step": 348
},
{
"epoch": 6.73,
"grad_norm": 0.671875,
"learning_rate": 1.0649860428512604e-05,
"loss": 2.5043,
"step": 349
},
{
"epoch": 6.75,
"grad_norm": 0.6328125,
"learning_rate": 1.029818410125365e-05,
"loss": 2.4816,
"step": 350
},
{
"epoch": 6.77,
"grad_norm": 0.64453125,
"learning_rate": 9.952096754324847e-06,
"loss": 2.5321,
"step": 351
},
{
"epoch": 6.79,
"grad_norm": 0.703125,
"learning_rate": 9.611619951117657e-06,
"loss": 2.5113,
"step": 352
},
{
"epoch": 6.81,
"grad_norm": 0.63671875,
"learning_rate": 9.276774905451869e-06,
"loss": 2.4642,
"step": 353
},
{
"epoch": 6.83,
"grad_norm": 0.640625,
"learning_rate": 8.94758248025378e-06,
"loss": 2.5338,
"step": 354
},
{
"epoch": 6.85,
"grad_norm": 0.609375,
"learning_rate": 8.624063186256326e-06,
"loss": 2.4683,
"step": 355
},
{
"epoch": 6.87,
"grad_norm": 0.65625,
"learning_rate": 8.306237180721121e-06,
"loss": 2.4918,
"step": 356
},
{
"epoch": 6.89,
"grad_norm": 0.67578125,
"learning_rate": 7.994124266182568e-06,
"loss": 2.4688,
"step": 357
},
{
"epoch": 6.91,
"grad_norm": 0.6328125,
"learning_rate": 7.687743889213938e-06,
"loss": 2.4914,
"step": 358
},
{
"epoch": 6.93,
"grad_norm": 0.60546875,
"learning_rate": 7.387115139215894e-06,
"loss": 2.4586,
"step": 359
},
{
"epoch": 6.95,
"grad_norm": 0.62890625,
"learning_rate": 7.0922567472269444e-06,
"loss": 2.4991,
"step": 360
},
{
"epoch": 6.97,
"grad_norm": 0.625,
"learning_rate": 6.803187084756524e-06,
"loss": 2.5431,
"step": 361
},
{
"epoch": 6.99,
"grad_norm": 0.6171875,
"learning_rate": 6.519924162640167e-06,
"loss": 2.4695,
"step": 362
},
{
"epoch": 7.0,
"grad_norm": 0.625,
"learning_rate": 6.242485629917494e-06,
"loss": 2.475,
"step": 363
},
{
"epoch": 7.01,
"grad_norm": 0.71484375,
"learning_rate": 5.9708887727324525e-06,
"loss": 2.4779,
"step": 364
},
{
"epoch": 7.03,
"grad_norm": 0.59375,
"learning_rate": 5.7051505132562965e-06,
"loss": 2.5005,
"step": 365
},
{
"epoch": 7.05,
"grad_norm": 0.5859375,
"learning_rate": 5.445287408633304e-06,
"loss": 2.4736,
"step": 366
},
{
"epoch": 7.07,
"grad_norm": 0.59375,
"learning_rate": 5.191315649949047e-06,
"loss": 2.4719,
"step": 367
},
{
"epoch": 7.09,
"grad_norm": 0.62890625,
"learning_rate": 4.943251061221721e-06,
"loss": 2.5049,
"step": 368
},
{
"epoch": 7.11,
"grad_norm": 0.640625,
"learning_rate": 4.701109098416079e-06,
"loss": 2.507,
"step": 369
},
{
"epoch": 7.13,
"grad_norm": 0.640625,
"learning_rate": 4.464904848480523e-06,
"loss": 2.4379,
"step": 370
},
{
"epoch": 7.15,
"grad_norm": 0.61328125,
"learning_rate": 4.234653028407054e-06,
"loss": 2.4399,
"step": 371
},
{
"epoch": 7.17,
"grad_norm": 0.60546875,
"learning_rate": 4.0103679843142895e-06,
"loss": 2.4656,
"step": 372
},
{
"epoch": 7.19,
"grad_norm": 0.6484375,
"learning_rate": 3.7920636905537155e-06,
"loss": 2.493,
"step": 373
},
{
"epoch": 7.21,
"grad_norm": 0.640625,
"learning_rate": 3.5797537488388323e-06,
"loss": 2.4273,
"step": 374
},
{
"epoch": 7.23,
"grad_norm": 0.62109375,
"learning_rate": 3.373451387397819e-06,
"loss": 2.4495,
"step": 375
},
{
"epoch": 7.25,
"grad_norm": 0.63671875,
"learning_rate": 3.1731694601492833e-06,
"loss": 2.4133,
"step": 376
},
{
"epoch": 7.27,
"grad_norm": 0.60546875,
"learning_rate": 2.9789204459013785e-06,
"loss": 2.4453,
"step": 377
},
{
"epoch": 7.29,
"grad_norm": 0.7265625,
"learning_rate": 2.7907164475743043e-06,
"loss": 2.4051,
"step": 378
},
{
"epoch": 7.31,
"grad_norm": 0.62890625,
"learning_rate": 2.6085691914462306e-06,
"loss": 2.4648,
"step": 379
},
{
"epoch": 7.33,
"grad_norm": 0.59765625,
"learning_rate": 2.4324900264226403e-06,
"loss": 2.5328,
"step": 380
},
{
"epoch": 7.35,
"grad_norm": 0.6171875,
"learning_rate": 2.2624899233292806e-06,
"loss": 2.4942,
"step": 381
},
{
"epoch": 7.37,
"grad_norm": 0.60546875,
"learning_rate": 2.098579474228546e-06,
"loss": 2.402,
"step": 382
},
{
"epoch": 7.39,
"grad_norm": 0.640625,
"learning_rate": 1.9407688917595925e-06,
"loss": 2.4559,
"step": 383
},
{
"epoch": 7.41,
"grad_norm": 0.6328125,
"learning_rate": 1.7890680085019595e-06,
"loss": 2.47,
"step": 384
},
{
"epoch": 7.43,
"grad_norm": 0.62890625,
"learning_rate": 1.6434862763630155e-06,
"loss": 2.5025,
"step": 385
},
{
"epoch": 7.45,
"grad_norm": 0.63671875,
"learning_rate": 1.5040327659889608e-06,
"loss": 2.4876,
"step": 386
},
{
"epoch": 7.47,
"grad_norm": 0.640625,
"learning_rate": 1.370716166199726e-06,
"loss": 2.4821,
"step": 387
},
{
"epoch": 7.49,
"grad_norm": 0.609375,
"learning_rate": 1.2435447834476255e-06,
"loss": 2.5129,
"step": 388
},
{
"epoch": 7.5,
"grad_norm": 0.66015625,
"learning_rate": 1.122526541299751e-06,
"loss": 2.5025,
"step": 389
},
{
"epoch": 7.52,
"grad_norm": 0.640625,
"learning_rate": 1.0076689799442873e-06,
"loss": 2.4836,
"step": 390
},
{
"epoch": 7.54,
"grad_norm": 0.60546875,
"learning_rate": 8.989792557207887e-07,
"loss": 2.5049,
"step": 391
},
{
"epoch": 7.56,
"grad_norm": 0.6328125,
"learning_rate": 7.964641406742135e-07,
"loss": 2.4673,
"step": 392
},
{
"epoch": 7.58,
"grad_norm": 0.6171875,
"learning_rate": 7.001300221330387e-07,
"loss": 2.4203,
"step": 393
},
{
"epoch": 7.6,
"grad_norm": 0.66015625,
"learning_rate": 6.099829023112235e-07,
"loss": 2.4317,
"step": 394
},
{
"epoch": 7.62,
"grad_norm": 0.63671875,
"learning_rate": 5.260283979343084e-07,
"loss": 2.4273,
"step": 395
},
{
"epoch": 7.64,
"grad_norm": 0.63671875,
"learning_rate": 4.482717398894165e-07,
"loss": 2.539,
"step": 396
},
{
"epoch": 7.66,
"grad_norm": 0.63671875,
"learning_rate": 3.767177728993265e-07,
"loss": 2.4855,
"step": 397
},
{
"epoch": 7.68,
"grad_norm": 0.6640625,
"learning_rate": 3.1137095522068007e-07,
"loss": 2.525,
"step": 398
},
{
"epoch": 7.7,
"grad_norm": 0.61328125,
"learning_rate": 2.522353583661263e-07,
"loss": 2.3878,
"step": 399
},
{
"epoch": 7.72,
"grad_norm": 0.609375,
"learning_rate": 1.9931466685065847e-07,
"loss": 2.4339,
"step": 400
},
{
"epoch": 7.74,
"grad_norm": 0.59765625,
"learning_rate": 1.5261217796211923e-07,
"loss": 2.4491,
"step": 401
},
{
"epoch": 7.76,
"grad_norm": 0.64453125,
"learning_rate": 1.1213080155564326e-07,
"loss": 2.4598,
"step": 402
},
{
"epoch": 7.78,
"grad_norm": 0.68359375,
"learning_rate": 7.787305987243532e-08,
"loss": 2.4383,
"step": 403
},
{
"epoch": 7.8,
"grad_norm": 0.62890625,
"learning_rate": 4.9841087382618276e-08,
"loss": 2.4653,
"step": 404
},
{
"epoch": 7.82,
"grad_norm": 0.60546875,
"learning_rate": 2.8036630652206187e-08,
"loss": 2.4291,
"step": 405
},
{
"epoch": 7.84,
"grad_norm": 0.59375,
"learning_rate": 1.2461048234269079e-08,
"loss": 2.5025,
"step": 406
},
{
"epoch": 7.86,
"grad_norm": 0.625,
"learning_rate": 3.115310584367315e-09,
"loss": 2.4367,
"step": 407
},
{
"epoch": 7.88,
"grad_norm": 0.59765625,
"learning_rate": 0.0,
"loss": 2.5149,
"step": 408
}
],
"logging_steps": 1,
"max_steps": 408,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 51,
"total_flos": 6.078120159011144e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}