{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0033222591362125, "eval_steps": 57, "global_step": 226, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004429678848283499, "eval_loss": 2.5983426570892334, "eval_runtime": 5.0405, "eval_samples_per_second": 18.847, "eval_steps_per_second": 9.523, "step": 1 }, { "epoch": 0.0221483942414175, "grad_norm": 1.4756156206130981, "learning_rate": 1.6666666666666667e-05, "loss": 1.9339, "step": 5 }, { "epoch": 0.044296788482835, "grad_norm": 1.9938647747039795, "learning_rate": 3.3333333333333335e-05, "loss": 2.2285, "step": 10 }, { "epoch": 0.0664451827242525, "grad_norm": 2.0396549701690674, "learning_rate": 5e-05, "loss": 2.5228, "step": 15 }, { "epoch": 0.08859357696567, "grad_norm": 1.985188603401184, "learning_rate": 6.666666666666667e-05, "loss": 1.8434, "step": 20 }, { "epoch": 0.11074197120708748, "grad_norm": 2.442502975463867, "learning_rate": 8.333333333333334e-05, "loss": 1.5617, "step": 25 }, { "epoch": 0.132890365448505, "grad_norm": 2.1058413982391357, "learning_rate": 0.0001, "loss": 1.4216, "step": 30 }, { "epoch": 0.15503875968992248, "grad_norm": 2.086914300918579, "learning_rate": 9.983951473748578e-05, "loss": 1.2727, "step": 35 }, { "epoch": 0.17718715393134, "grad_norm": 1.712249994277954, "learning_rate": 9.935908917072252e-05, "loss": 1.4231, "step": 40 }, { "epoch": 0.19933554817275748, "grad_norm": 2.7731285095214844, "learning_rate": 9.85618073486382e-05, "loss": 1.482, "step": 45 }, { "epoch": 0.22148394241417496, "grad_norm": 5.423552989959717, "learning_rate": 9.745278735053343e-05, "loss": 2.1623, "step": 50 }, { "epoch": 0.24363233665559247, "grad_norm": 1.5658822059631348, "learning_rate": 9.603914843102941e-05, "loss": 0.7675, "step": 55 }, { "epoch": 0.25249169435215946, "eval_loss": 1.1176186800003052, "eval_runtime": 5.0421, "eval_samples_per_second": 18.841, "eval_steps_per_second": 9.52, "step": 57 }, { "epoch": 0.26578073089701, "grad_norm": 1.5977628231048584, "learning_rate": 9.432996531865002e-05, "loss": 0.9452, "step": 60 }, { "epoch": 0.28792912513842744, "grad_norm": 1.3801822662353516, "learning_rate": 9.233620996141421e-05, "loss": 1.0662, "step": 65 }, { "epoch": 0.31007751937984496, "grad_norm": 1.4718172550201416, "learning_rate": 9.007068109339784e-05, "loss": 1.021, "step": 70 }, { "epoch": 0.33222591362126247, "grad_norm": 1.4416835308074951, "learning_rate": 8.754792207440557e-05, "loss": 1.1909, "step": 75 }, { "epoch": 0.35437430786268, "grad_norm": 1.3599156141281128, "learning_rate": 8.478412753017433e-05, "loss": 1.0854, "step": 80 }, { "epoch": 0.37652270210409744, "grad_norm": 1.7577108144760132, "learning_rate": 8.179703939242276e-05, "loss": 0.9214, "step": 85 }, { "epoch": 0.39867109634551495, "grad_norm": 1.8137191534042358, "learning_rate": 7.860583300610849e-05, "loss": 1.1752, "step": 90 }, { "epoch": 0.42081949058693247, "grad_norm": 1.6590023040771484, "learning_rate": 7.52309940350173e-05, "loss": 1.2202, "step": 95 }, { "epoch": 0.4429678848283499, "grad_norm": 2.184537410736084, "learning_rate": 7.169418695587791e-05, "loss": 1.8343, "step": 100 }, { "epoch": 0.46511627906976744, "grad_norm": 1.2622162103652954, "learning_rate": 6.801811598519268e-05, "loss": 0.8566, "step": 105 }, { "epoch": 0.48726467331118495, "grad_norm": 1.2197306156158447, "learning_rate": 6.422637933155162e-05, "loss": 1.1106, "step": 110 }, { "epoch": 0.5049833887043189, "eval_loss": 1.0898905992507935, 
"eval_runtime": 5.0519, "eval_samples_per_second": 18.805, "eval_steps_per_second": 9.501, "step": 114 }, { "epoch": 0.5094130675526024, "grad_norm": 1.1158766746520996, "learning_rate": 6.0343317709044546e-05, "loss": 1.0906, "step": 115 }, { "epoch": 0.53156146179402, "grad_norm": 1.4978259801864624, "learning_rate": 5.6393858084225305e-05, "loss": 0.8869, "step": 120 }, { "epoch": 0.5537098560354374, "grad_norm": 1.5120636224746704, "learning_rate": 5.240335365968104e-05, "loss": 1.1671, "step": 125 }, { "epoch": 0.5758582502768549, "grad_norm": 1.4071290493011475, "learning_rate": 4.839742112141724e-05, "loss": 1.2143, "step": 130 }, { "epoch": 0.5980066445182725, "grad_norm": 1.576568365097046, "learning_rate": 4.4401776194834613e-05, "loss": 1.0833, "step": 135 }, { "epoch": 0.6201550387596899, "grad_norm": 1.2496248483657837, "learning_rate": 4.04420685649314e-05, "loss": 1.1847, "step": 140 }, { "epoch": 0.6423034330011074, "grad_norm": 1.3625198602676392, "learning_rate": 3.654371722044616e-05, "loss": 1.067, "step": 145 }, { "epoch": 0.6644518272425249, "grad_norm": 2.972466230392456, "learning_rate": 3.273174727893463e-05, "loss": 1.7863, "step": 150 }, { "epoch": 0.6866002214839424, "grad_norm": 0.9294478297233582, "learning_rate": 2.9030629340267164e-05, "loss": 0.9216, "step": 155 }, { "epoch": 0.70874861572536, "grad_norm": 1.618067979812622, "learning_rate": 2.5464122399803125e-05, "loss": 1.0288, "step": 160 }, { "epoch": 0.7308970099667774, "grad_norm": 1.3609340190887451, "learning_rate": 2.2055121329646418e-05, "loss": 0.9452, "step": 165 }, { "epoch": 0.7530454042081949, "grad_norm": 1.2449376583099365, "learning_rate": 1.8825509907063327e-05, "loss": 0.8706, "step": 170 }, { "epoch": 0.7574750830564784, "eval_loss": 1.0665383338928223, "eval_runtime": 5.0548, "eval_samples_per_second": 18.794, "eval_steps_per_second": 9.496, "step": 171 }, { "epoch": 0.7751937984496124, "grad_norm": 1.1113590002059937, "learning_rate": 1.5796020333532695e-05, "loss": 1.0823, "step": 175 }, { "epoch": 0.7973421926910299, "grad_norm": 1.1141512393951416, "learning_rate": 1.2986100146234232e-05, "loss": 0.8785, "step": 180 }, { "epoch": 0.8194905869324474, "grad_norm": 1.8312709331512451, "learning_rate": 1.0413787376324019e-05, "loss": 1.4858, "step": 185 }, { "epoch": 0.8416389811738649, "grad_norm": 1.527754545211792, "learning_rate": 8.09559475540797e-06, "loss": 1.3716, "step": 190 }, { "epoch": 0.8637873754152824, "grad_norm": 2.084320545196533, "learning_rate": 6.0464037135391395e-06, "loss": 1.3418, "step": 195 }, { "epoch": 0.8859357696566998, "grad_norm": 2.9202771186828613, "learning_rate": 4.279368849209381e-06, "loss": 1.5893, "step": 200 }, { "epoch": 0.9080841638981174, "grad_norm": 1.439396619796753, "learning_rate": 2.8058334845816213e-06, "loss": 0.9657, "step": 205 }, { "epoch": 0.9302325581395349, "grad_norm": 1.4320131540298462, "learning_rate": 1.6352568480485276e-06, "loss": 1.0156, "step": 210 }, { "epoch": 0.9523809523809523, "grad_norm": 1.5984946489334106, "learning_rate": 7.751533515623799e-07, "loss": 1.0568, "step": 215 }, { "epoch": 0.9745293466223699, "grad_norm": 1.2688162326812744, "learning_rate": 2.310443525400885e-07, "loss": 0.9799, "step": 220 }, { "epoch": 0.9966777408637874, "grad_norm": 2.5447208881378174, "learning_rate": 6.422710003439747e-09, "loss": 1.5954, "step": 225 } ], "logging_steps": 5, "max_steps": 226, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { 
"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.098941995142349e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }