AliHmlii
/

zephyr-7b-sft-qlora

@@ -36,14 +36,14 @@ More information needed
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
-- train_batch_size: 2
 - eval_batch_size: 8
 - seed: 42
 - distributed_type: multi-GPU
-- num_devices: 4
 - gradient_accumulation_steps: 2
 - total_train_batch_size: 16
-- total_eval_batch_size: 32
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
@@ -55,8 +55,8 @@ The following hyperparameters were used during training:
 ### Framework versions
-- PEFT 0.11.1
-- Transformers 4.39.3
-- Pytorch 2.3.1+cu121
-- Datasets 2.20.0
-- Tokenizers 0.15.2

 The following hyperparameters were used during training:
 - learning_rate: 0.0002
+- train_batch_size: 4
 - eval_batch_size: 8
 - seed: 42
 - distributed_type: multi-GPU
+- num_devices: 2
 - gradient_accumulation_steps: 2
 - total_train_batch_size: 16
+- total_eval_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
 ### Framework versions
+- PEFT 0.13.1
+- Transformers 4.45.2
+- Pytorch 2.4.1+cu121
+- Datasets 3.0.1
+- Tokenizers 0.20.0

all_results.json CHANGED Viewed

@@ -1,8 +1,9 @@
 {
     "epoch": 1.0,
-    "train_loss": 0.6069550340240066,
-    "train_runtime": 2548.2523,
-    "train_samples": 5919,
-    "train_samples_per_second": 2.323,
-    "train_steps_per_second": 0.145
 }

 {
     "epoch": 1.0,
+    "total_flos": 1.025624031790891e+18,
+    "train_loss": 0.004042043934228078,
+    "train_runtime": 6129.3476,
+    "train_samples": 18448,
+    "train_samples_per_second": 3.01,
+    "train_steps_per_second": 0.188
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,9 @@
 {
     "epoch": 1.0,
-    "train_loss": 0.6069550340240066,
-    "train_runtime": 2548.2523,
-    "train_samples": 5919,
-    "train_samples_per_second": 2.323,
-    "train_steps_per_second": 0.145
 }

 {
     "epoch": 1.0,
+    "total_flos": 1.025624031790891e+18,
+    "train_loss": 0.004042043934228078,
+    "train_runtime": 6129.3476,
+    "train_samples": 18448,
+    "train_samples_per_second": 3.01,
+    "train_steps_per_second": 0.188
 }

trainer_state.json CHANGED Viewed

@@ -3,553 +3,1657 @@
   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
-  "global_step": 370,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0,
-      "grad_norm": 1.875,
-      "learning_rate": 5.405405405405406e-06,
-      "loss": 1.9822,
       "step": 1
     },
     {
-      "epoch": 0.01,
-      "grad_norm": 1.640625,
-      "learning_rate": 2.702702702702703e-05,
-      "loss": 1.9317,
       "step": 5
     },
     {
-      "epoch": 0.03,
-      "grad_norm": 1.1640625,
-      "learning_rate": 5.405405405405406e-05,
-      "loss": 1.7616,
       "step": 10
     },
     {
-      "epoch": 0.04,
-      "grad_norm": 1.5703125,
-      "learning_rate": 8.108108108108109e-05,
-      "loss": 1.3983,
       "step": 15
     },
     {
-      "epoch": 0.05,
-      "grad_norm": 1.0234375,
-      "learning_rate": 0.00010810810810810812,
-      "loss": 1.0915,
       "step": 20
     },
     {
-      "epoch": 0.07,
-      "grad_norm": 0.6328125,
-      "learning_rate": 0.00013513513513513514,
-      "loss": 0.8498,
       "step": 25
     },
     {
-      "epoch": 0.08,
-      "grad_norm": 0.50390625,
-      "learning_rate": 0.00016216216216216218,
-      "loss": 0.7893,
       "step": 30
     },
     {
-      "epoch": 0.09,
-      "grad_norm": 0.51171875,
-      "learning_rate": 0.0001891891891891892,
-      "loss": 0.7231,
       "step": 35
     },
     {
-      "epoch": 0.11,
-      "grad_norm": 0.46875,
-      "learning_rate": 0.0001999599507118322,
-      "loss": 0.7377,
       "step": 40
     },
     {
-      "epoch": 0.12,
-      "grad_norm": 0.45703125,
-      "learning_rate": 0.00019971532122280464,
-      "loss": 0.7216,
       "step": 45
     },
     {
-      "epoch": 0.14,
-      "grad_norm": 0.45703125,
-      "learning_rate": 0.0001992488554155135,
-      "loss": 0.6466,
       "step": 50
     },
     {
-      "epoch": 0.15,
-      "grad_norm": 0.431640625,
-      "learning_rate": 0.00019856159103477086,
-      "loss": 0.6405,
       "step": 55
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 0.435546875,
-      "learning_rate": 0.00019765505703518496,
-      "loss": 0.6432,
       "step": 60
     },
     {
-      "epoch": 0.18,
-      "grad_norm": 0.443359375,
-      "learning_rate": 0.00019653127017970034,
-      "loss": 0.5388,
       "step": 65
     },
     {
-      "epoch": 0.19,
-      "grad_norm": 0.427734375,
-      "learning_rate": 0.00019519273055291266,
-      "loss": 0.6291,
       "step": 70
     },
     {
-      "epoch": 0.2,
-      "grad_norm": 0.396484375,
-      "learning_rate": 0.00019364241599913924,
-      "loss": 0.6107,
       "step": 75
     },
     {
-      "epoch": 0.22,
-      "grad_norm": 0.486328125,
-      "learning_rate": 0.00019188377549761963,
-      "loss": 0.5761,
       "step": 80
     },
     {
-      "epoch": 0.23,
-      "grad_norm": 0.396484375,
-      "learning_rate": 0.00018992072148958368,
-      "loss": 0.6359,
       "step": 85
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 0.375,
-      "learning_rate": 0.00018775762117425777,
-      "loss": 0.5815,
       "step": 90
     },
     {
-      "epoch": 0.26,
-      "grad_norm": 0.42578125,
-      "learning_rate": 0.0001853992867931721,
-      "loss": 0.5828,
       "step": 95
     },
     {
-      "epoch": 0.27,
-      "grad_norm": 0.390625,
-      "learning_rate": 0.00018285096492438424,
-      "loss": 0.5562,
       "step": 100
     },
     {
-      "epoch": 0.28,
-      "grad_norm": 0.4140625,
-      "learning_rate": 0.00018011832481043576,
-      "loss": 0.5417,
       "step": 105
     },
     {
-      "epoch": 0.3,
-      "grad_norm": 0.396484375,
-      "learning_rate": 0.00017720744574600863,
-      "loss": 0.5668,
       "step": 110
     },
     {
-      "epoch": 0.31,
-      "grad_norm": 0.38671875,
-      "learning_rate": 0.00017412480355334005,
-      "loss": 0.61,
       "step": 115
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 0.37890625,
-      "learning_rate": 0.00017087725617548385,
-      "loss": 0.5436,
       "step": 120
     },
     {
-      "epoch": 0.34,
-      "grad_norm": 0.37109375,
-      "learning_rate": 0.00016747202841946928,
-      "loss": 0.5427,
       "step": 125
     },
     {
-      "epoch": 0.35,
-      "grad_norm": 0.384765625,
-      "learning_rate": 0.0001639166958832985,
-      "loss": 0.5934,
       "step": 130
     },
     {
-      "epoch": 0.36,
-      "grad_norm": 0.412109375,
-      "learning_rate": 0.00016021916810254097,
-      "loss": 0.5465,
       "step": 135
     },
     {
-      "epoch": 0.38,
-      "grad_norm": 0.40234375,
-      "learning_rate": 0.0001563876709540178,
-      "loss": 0.532,
       "step": 140
     },
     {
-      "epoch": 0.39,
-      "grad_norm": 0.390625,
-      "learning_rate": 0.00015243072835572318,
-      "loss": 0.5278,
       "step": 145
     },
     {
-      "epoch": 0.41,
-      "grad_norm": 0.33984375,
-      "learning_rate": 0.00014835714330369446,
-      "loss": 0.5204,
       "step": 150
     },
     {
-      "epoch": 0.42,
-      "grad_norm": 0.39453125,
-      "learning_rate": 0.00014417597828801832,
-      "loss": 0.5868,
       "step": 155
     },
     {
-      "epoch": 0.43,
-      "grad_norm": 0.388671875,
-      "learning_rate": 0.00013989653513154165,
-      "loss": 0.5338,
       "step": 160
     },
     {
-      "epoch": 0.45,
-      "grad_norm": 0.3828125,
-      "learning_rate": 0.00013552833429613938,
-      "loss": 0.589,
       "step": 165
     },
     {
-      "epoch": 0.46,
-      "grad_norm": 0.376953125,
-      "learning_rate": 0.00013108109370257712,
-      "loss": 0.5301,
       "step": 170
     },
     {
-      "epoch": 0.47,
-      "grad_norm": 0.390625,
-      "learning_rate": 0.00012656470711108764,
-      "loss": 0.4945,
       "step": 175
     },
     {
-      "epoch": 0.49,
-      "grad_norm": 0.40625,
-      "learning_rate": 0.00012198922211075778,
-      "loss": 0.4662,
       "step": 180
     },
     {
-      "epoch": 0.5,
-      "grad_norm": 0.423828125,
-      "learning_rate": 0.00011736481776669306,
-      "loss": 0.5302,
       "step": 185
     },
     {
-      "epoch": 0.51,
-      "grad_norm": 0.38671875,
-      "learning_rate": 0.00011270178197468789,
-      "loss": 0.5807,
       "step": 190
     },
     {
-      "epoch": 0.53,
-      "grad_norm": 0.37890625,
-      "learning_rate": 0.00010801048857378071,
-      "loss": 0.519,
       "step": 195
     },
     {
-      "epoch": 0.54,
-      "grad_norm": 0.37890625,
-      "learning_rate": 0.00010330137426761135,
-      "loss": 0.5623,
       "step": 200
     },
     {
-      "epoch": 0.55,
-      "grad_norm": 0.3828125,
-      "learning_rate": 9.858491540592382e-05,
-      "loss": 0.5507,
       "step": 205
     },
     {
-      "epoch": 0.57,
-      "grad_norm": 0.373046875,
-      "learning_rate": 9.38716046778684e-05,
-      "loss": 0.5183,
       "step": 210
     },
     {
-      "epoch": 0.58,
-      "grad_norm": 0.390625,
-      "learning_rate": 8.917192776895382e-05,
-      "loss": 0.5846,
       "step": 215
     },
     {
-      "epoch": 0.59,
-      "grad_norm": 0.373046875,
-      "learning_rate": 8.449634003358022e-05,
-      "loss": 0.5085,
       "step": 220
     },
     {
-      "epoch": 0.61,
-      "grad_norm": 0.380859375,
-      "learning_rate": 7.985524323504948e-05,
-      "loss": 0.4886,
       "step": 225
     },
     {
-      "epoch": 0.62,
-      "grad_norm": 0.359375,
-      "learning_rate": 7.525896240479976e-05,
-      "loss": 0.4739,
       "step": 230
     },
     {
-      "epoch": 0.64,
-      "grad_norm": 0.35546875,
-      "learning_rate": 7.071772287234497e-05,
-      "loss": 0.5171,
       "step": 235
     },
     {
-      "epoch": 0.65,
-      "grad_norm": 0.37109375,
-      "learning_rate": 6.624162751702076e-05,
-      "loss": 0.5222,
       "step": 240
     },
     {
-      "epoch": 0.66,
-      "grad_norm": 0.369140625,
-      "learning_rate": 6.184063429214515e-05,
-      "loss": 0.5699,
       "step": 245
     },
     {
-      "epoch": 0.68,
-      "grad_norm": 0.369140625,
-      "learning_rate": 5.752453407159522e-05,
-      "loss": 0.5155,
       "step": 250
     },
     {
-      "epoch": 0.69,
-      "grad_norm": 0.37890625,
-      "learning_rate": 5.33029288680852e-05,
-      "loss": 0.5129,
       "step": 255
     },
     {
-      "epoch": 0.7,
-      "grad_norm": 0.376953125,
-      "learning_rate": 4.918521047160308e-05,
-      "loss": 0.5324,
       "step": 260
     },
     {
-      "epoch": 0.72,
-      "grad_norm": 0.373046875,
-      "learning_rate": 4.518053955552903e-05,
-      "loss": 0.5163,
       "step": 265
     },
     {
-      "epoch": 0.73,
-      "grad_norm": 0.361328125,
-      "learning_rate": 4.129782529691815e-05,
-      "loss": 0.5355,
       "step": 270
     },
     {
-      "epoch": 0.74,
-      "grad_norm": 0.37890625,
-      "learning_rate": 3.7545705556286126e-05,
-      "loss": 0.4844,
       "step": 275
     },
     {
-      "epoch": 0.76,
-      "grad_norm": 0.37890625,
-      "learning_rate": 3.393252766099187e-05,
-      "loss": 0.5222,
       "step": 280
     },
     {
-      "epoch": 0.77,
-      "grad_norm": 0.3828125,
-      "learning_rate": 3.0466329834968233e-05,
-      "loss": 0.4567,
       "step": 285
     },
     {
-      "epoch": 0.78,
-      "grad_norm": 0.349609375,
-      "learning_rate": 2.7154823316113932e-05,
-      "loss": 0.4622,
       "step": 290
     },
     {
-      "epoch": 0.8,
-      "grad_norm": 0.35546875,
-      "learning_rate": 2.4005375201130274e-05,
-      "loss": 0.5067,
       "step": 295
     },
     {
-      "epoch": 0.81,
-      "grad_norm": 0.361328125,
-      "learning_rate": 2.102499205596743e-05,
-      "loss": 0.4906,
       "step": 300
     },
     {
-      "epoch": 0.82,
-      "grad_norm": 0.369140625,
-      "learning_rate": 1.8220304328342252e-05,
-      "loss": 0.5363,
       "step": 305
     },
     {
-      "epoch": 0.84,
-      "grad_norm": 0.390625,
-      "learning_rate": 1.5597551597004966e-05,
-      "loss": 0.5493,
       "step": 310
     },
     {
-      "epoch": 0.85,
-      "grad_norm": 0.373046875,
-      "learning_rate": 1.3162568690570743e-05,
-      "loss": 0.523,
       "step": 315
     },
     {
-      "epoch": 0.86,
-      "grad_norm": 0.37890625,
-      "learning_rate": 1.0920772706797167e-05,
-      "loss": 0.4544,
       "step": 320
     },
     {
-      "epoch": 0.88,
-      "grad_norm": 0.359375,
-      "learning_rate": 8.87715096118642e-06,
-      "loss": 0.5245,
       "step": 325
     },
     {
-      "epoch": 0.89,
-      "grad_norm": 0.361328125,
-      "learning_rate": 7.03624989172228e-06,
-      "loss": 0.5121,
       "step": 330
     },
     {
-      "epoch": 0.91,
-      "grad_norm": 0.35546875,
-      "learning_rate": 5.402164944425758e-06,
-      "loss": 0.4592,
       "step": 335
     },
     {
-      "epoch": 0.92,
-      "grad_norm": 0.390625,
-      "learning_rate": 3.9785314622310495e-06,
-      "loss": 0.4671,
       "step": 340
     },
     {
-      "epoch": 0.93,
-      "grad_norm": 0.359375,
-      "learning_rate": 2.7685165974510986e-06,
-      "loss": 0.5781,
       "step": 345
     },
     {
-      "epoch": 0.95,
-      "grad_norm": 0.37109375,
-      "learning_rate": 1.7748122658251876e-06,
-      "loss": 0.4408,
       "step": 350
     },
     {
-      "epoch": 0.96,
-      "grad_norm": 0.34765625,
-      "learning_rate": 9.996291578236228e-07,
-      "loss": 0.554,
       "step": 355
     },
     {
-      "epoch": 0.97,
-      "grad_norm": 0.396484375,
-      "learning_rate": 4.44691820532539e-07,
-      "loss": 0.5218,
       "step": 360
     },
     {
-      "epoch": 0.99,
-      "grad_norm": 0.365234375,
-      "learning_rate": 1.1123482106021322e-07,
-      "loss": 0.4716,
       "step": 365
     },
     {
-      "epoch": 1.0,
-      "grad_norm": 0.34375,
-      "learning_rate": 0.0,
-      "loss": 0.48,
       "step": 370
     },
     {
       "epoch": 1.0,
-      "step": 370,
-      "total_flos": 7.40853344674775e+17,
-      "train_loss": 0.6069550340240066,
-      "train_runtime": 2548.2523,
-      "train_samples_per_second": 2.323,
-      "train_steps_per_second": 0.145
     }
   ],
   "logging_steps": 5,
-  "max_steps": 370,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
-  "save_steps": 50,
-  "total_flos": 7.40853344674775e+17,
-  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }

   "best_model_checkpoint": null,
   "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 1153,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0008673026886383347,
+      "grad_norm": 0.824800431728363,
+      "learning_rate": 1.724137931034483e-06,
+      "loss": 0.2659,
       "step": 1
     },
     {
+      "epoch": 0.004336513443191674,
+      "grad_norm": 0.697605550289154,
+      "learning_rate": 8.620689655172414e-06,
+      "loss": 0.3205,
       "step": 5
     },
     {
+      "epoch": 0.008673026886383347,
+      "grad_norm": 0.5697792172431946,
+      "learning_rate": 1.7241379310344828e-05,
+      "loss": 0.2515,
       "step": 10
     },
     {
+      "epoch": 0.013009540329575022,
+      "grad_norm": 0.4874624013900757,
+      "learning_rate": 2.5862068965517244e-05,
+      "loss": 0.1831,
       "step": 15
     },
     {
+      "epoch": 0.017346053772766695,
+      "grad_norm": 0.32868602871894836,
+      "learning_rate": 3.4482758620689657e-05,
+      "loss": 0.0883,
       "step": 20
     },
     {
+      "epoch": 0.02168256721595837,
+      "grad_norm": 0.17428267002105713,
+      "learning_rate": 4.3103448275862066e-05,
+      "loss": 0.0309,
       "step": 25
     },
     {
+      "epoch": 0.026019080659150044,
+      "grad_norm": 0.07211117446422577,
+      "learning_rate": 5.172413793103449e-05,
+      "loss": 0.013,
       "step": 30
     },
     {
+      "epoch": 0.030355594102341718,
+      "grad_norm": 0.07596682757139206,
+      "learning_rate": 6.03448275862069e-05,
+      "loss": 0.0043,
       "step": 35
     },
     {
+      "epoch": 0.03469210754553339,
+      "grad_norm": 0.06741539388895035,
+      "learning_rate": 6.896551724137931e-05,
+      "loss": 0.0031,
       "step": 40
     },
     {
+      "epoch": 0.03902862098872507,
+      "grad_norm": 0.020168501883745193,
+      "learning_rate": 7.758620689655173e-05,
+      "loss": 0.0015,
       "step": 45
     },
     {
+      "epoch": 0.04336513443191674,
+      "grad_norm": 0.037137292325496674,
+      "learning_rate": 8.620689655172413e-05,
+      "loss": 0.0015,
       "step": 50
     },
     {
+      "epoch": 0.047701647875108416,
+      "grad_norm": 0.13907165825366974,
+      "learning_rate": 9.482758620689656e-05,
+      "loss": 0.0009,
       "step": 55
     },
     {
+      "epoch": 0.05203816131830009,
+      "grad_norm": 0.0042100874707102776,
+      "learning_rate": 0.00010344827586206898,
+      "loss": 0.0013,
       "step": 60
     },
     {
+      "epoch": 0.05637467476149176,
+      "grad_norm": 0.10066337138414383,
+      "learning_rate": 0.00011206896551724138,
+      "loss": 0.0011,
       "step": 65
     },
     {
+      "epoch": 0.060711188204683436,
+      "grad_norm": 0.0019178424263373017,
+      "learning_rate": 0.0001206896551724138,
+      "loss": 0.0021,
       "step": 70
     },
     {
+      "epoch": 0.06504770164787511,
+      "grad_norm": 0.09226825088262558,
+      "learning_rate": 0.0001293103448275862,
+      "loss": 0.001,
       "step": 75
     },
     {
+      "epoch": 0.06938421509106678,
+      "grad_norm": 0.0025779225397855043,
+      "learning_rate": 0.00013793103448275863,
+      "loss": 0.0004,
       "step": 80
     },
     {
+      "epoch": 0.07372072853425846,
+      "grad_norm": 0.0068855443969368935,
+      "learning_rate": 0.00014655172413793104,
+      "loss": 0.0004,
       "step": 85
     },
     {
+      "epoch": 0.07805724197745013,
+      "grad_norm": 0.0020994944497942924,
+      "learning_rate": 0.00015517241379310346,
+      "loss": 0.0005,
       "step": 90
     },
     {
+      "epoch": 0.0823937554206418,
+      "grad_norm": 0.002034812932834029,
+      "learning_rate": 0.00016379310344827587,
+      "loss": 0.0003,
       "step": 95
     },
     {
+      "epoch": 0.08673026886383348,
+      "grad_norm": 0.0018760113744065166,
+      "learning_rate": 0.00017241379310344826,
+      "loss": 0.0025,
       "step": 100
     },
     {
+      "epoch": 0.09106678230702515,
+      "grad_norm": 0.0013358393916860223,
+      "learning_rate": 0.0001810344827586207,
+      "loss": 0.0004,
       "step": 105
     },
     {
+      "epoch": 0.09540329575021683,
+      "grad_norm": 0.01223821472376585,
+      "learning_rate": 0.00018965517241379312,
+      "loss": 0.0013,
       "step": 110
     },
     {
+      "epoch": 0.0997398091934085,
+      "grad_norm": 0.010651814751327038,
+      "learning_rate": 0.00019827586206896554,
+      "loss": 0.0011,
       "step": 115
     },
     {
+      "epoch": 0.10407632263660017,
+      "grad_norm": 0.0021788859739899635,
+      "learning_rate": 0.0001999926577882564,
+      "loss": 0.0003,
       "step": 120
     },
     {
+      "epoch": 0.10841283607979185,
+      "grad_norm": 0.006896303500980139,
+      "learning_rate": 0.00019996283190086388,
+      "loss": 0.0003,
       "step": 125
     },
     {
+      "epoch": 0.11274934952298352,
+      "grad_norm": 0.003941075410693884,
+      "learning_rate": 0.00019991007028765122,
+      "loss": 0.0004,
       "step": 130
     },
     {
+      "epoch": 0.1170858629661752,
+      "grad_norm": 0.0025480242911726236,
+      "learning_rate": 0.0001998343850543768,
+      "loss": 0.0004,
       "step": 135
     },
     {
+      "epoch": 0.12142237640936687,
+      "grad_norm": 0.0022759223356842995,
+      "learning_rate": 0.0001997357935664527,
+      "loss": 0.0017,
       "step": 140
     },
     {
+      "epoch": 0.12575888985255854,
+      "grad_norm": 0.004669133573770523,
+      "learning_rate": 0.00019961431844496002,
+      "loss": 0.0005,
       "step": 145
     },
     {
+      "epoch": 0.13009540329575023,
+      "grad_norm": 0.0009343024576082826,
+      "learning_rate": 0.0001994699875614589,
+      "loss": 0.0003,
       "step": 150
     },
     {
+      "epoch": 0.1344319167389419,
+      "grad_norm": 0.0016963341040536761,
+      "learning_rate": 0.00019930283403159357,
+      "loss": 0.0002,
       "step": 155
     },
     {
+      "epoch": 0.13876843018213356,
+      "grad_norm": 0.003908711485564709,
+      "learning_rate": 0.000199112896207494,
+      "loss": 0.0002,
       "step": 160
     },
     {
+      "epoch": 0.14310494362532525,
+      "grad_norm": 0.001237453892827034,
+      "learning_rate": 0.00019890021766897663,
+      "loss": 0.0003,
       "step": 165
     },
     {
+      "epoch": 0.1474414570685169,
+      "grad_norm": 0.0009205214446410537,
+      "learning_rate": 0.00019866484721354499,
+      "loss": 0.0002,
       "step": 170
     },
     {
+      "epoch": 0.15177797051170858,
+      "grad_norm": 0.0009426427423022687,
+      "learning_rate": 0.00019840683884519368,
+      "loss": 0.0009,
       "step": 175
     },
     {
+      "epoch": 0.15611448395490027,
+      "grad_norm": 0.005143130198121071,
+      "learning_rate": 0.00019812625176201745,
+      "loss": 0.0002,
       "step": 180
     },
     {
+      "epoch": 0.16045099739809193,
+      "grad_norm": 0.0014524105936288834,
+      "learning_rate": 0.0001978231503426287,
+      "loss": 0.0002,
       "step": 185
     },
     {
+      "epoch": 0.1647875108412836,
+      "grad_norm": 0.06288997083902359,
+      "learning_rate": 0.00019749760413138626,
+      "loss": 0.0007,
       "step": 190
     },
     {
+      "epoch": 0.1691240242844753,
+      "grad_norm": 0.0016669128090143204,
+      "learning_rate": 0.0001971496878224389,
+      "loss": 0.0014,
       "step": 195
     },
     {
+      "epoch": 0.17346053772766695,
+      "grad_norm": 0.001623625634238124,
+      "learning_rate": 0.00019677948124258748,
+      "loss": 0.0002,
       "step": 200
     },
     {
+      "epoch": 0.17779705117085862,
+      "grad_norm": 0.000822431524284184,
+      "learning_rate": 0.00019638706933296915,
+      "loss": 0.0001,
       "step": 205
     },
     {
+      "epoch": 0.1821335646140503,
+      "grad_norm": 0.004115803632885218,
+      "learning_rate": 0.00019597254212956822,
+      "loss": 0.0004,
       "step": 210
     },
     {
+      "epoch": 0.18647007805724197,
+      "grad_norm": 0.0006049461662769318,
+      "learning_rate": 0.00019553599474255812,
+      "loss": 0.0001,
       "step": 215
     },
     {
+      "epoch": 0.19080659150043366,
+      "grad_norm": 0.0010940938955172896,
+      "learning_rate": 0.0001950775273344792,
+      "loss": 0.0002,
       "step": 220
     },
     {
+      "epoch": 0.19514310494362533,
+      "grad_norm": 0.013846734538674355,
+      "learning_rate": 0.00019459724509725694,
+      "loss": 0.0002,
       "step": 225
     },
     {
+      "epoch": 0.199479618386817,
+      "grad_norm": 0.0004897746257483959,
+      "learning_rate": 0.00019409525822806662,
+      "loss": 0.0001,
       "step": 230
     },
     {
+      "epoch": 0.20381613183000868,
+      "grad_norm": 0.0004935916513204575,
+      "learning_rate": 0.00019357168190404936,
+      "loss": 0.0001,
       "step": 235
     },
     {
+      "epoch": 0.20815264527320035,
+      "grad_norm": 0.00045111175859346986,
+      "learning_rate": 0.00019302663625588563,
+      "loss": 0.0001,
       "step": 240
     },
     {
+      "epoch": 0.212489158716392,
+      "grad_norm": 0.0004669725021813065,
+      "learning_rate": 0.00019246024634023208,
+      "loss": 0.0001,
       "step": 245
     },
     {
+      "epoch": 0.2168256721595837,
+      "grad_norm": 0.0005562947480939329,
+      "learning_rate": 0.0001918726421110282,
+      "loss": 0.0001,
       "step": 250
     },
     {
+      "epoch": 0.22116218560277537,
+      "grad_norm": 0.0005943718133494258,
+      "learning_rate": 0.00019126395838967941,
+      "loss": 0.0001,
       "step": 255
     },
     {
+      "epoch": 0.22549869904596703,
+      "grad_norm": 0.000462163268821314,
+      "learning_rate": 0.00019063433483412347,
+      "loss": 0.0001,
       "step": 260
     },
     {
+      "epoch": 0.22983521248915872,
+      "grad_norm": 0.000997440074570477,
+      "learning_rate": 0.00018998391590678665,
+      "loss": 0.0001,
       "step": 265
     },
     {
+      "epoch": 0.2341717259323504,
+      "grad_norm": 0.00212860107421875,
+      "learning_rate": 0.00018931285084143818,
+      "loss": 0.0002,
       "step": 270
     },
     {
+      "epoch": 0.23850823937554205,
+      "grad_norm": 0.0011264905333518982,
+      "learning_rate": 0.00018862129360894958,
+      "loss": 0.0001,
       "step": 275
     },
     {
+      "epoch": 0.24284475281873374,
+      "grad_norm": 0.0006431519868783653,
+      "learning_rate": 0.00018790940288196715,
+      "loss": 0.0001,
       "step": 280
     },
     {
+      "epoch": 0.2471812662619254,
+      "grad_norm": 0.000361749145668,
+      "learning_rate": 0.00018717734199850584,
+      "loss": 0.0001,
       "step": 285
     },
     {
+      "epoch": 0.2515177797051171,
+      "grad_norm": 0.0005252111586742103,
+      "learning_rate": 0.00018642527892447243,
+      "loss": 0.0001,
       "step": 290
     },
     {
+      "epoch": 0.25585429314830876,
+      "grad_norm": 0.000348928413586691,
+      "learning_rate": 0.0001856533862151271,
+      "loss": 0.0001,
       "step": 295
     },
     {
+      "epoch": 0.26019080659150046,
+      "grad_norm": 0.00145808607339859,
+      "learning_rate": 0.00018486184097549186,
+      "loss": 0.0001,
       "step": 300
     },
     {
+      "epoch": 0.2645273200346921,
+      "grad_norm": 0.0007445367518812418,
+      "learning_rate": 0.00018405082481971516,
+      "loss": 0.0004,
       "step": 305
     },
     {
+      "epoch": 0.2688638334778838,
+      "grad_norm": 0.00031382605084218085,
+      "learning_rate": 0.0001832205238294018,
+      "loss": 0.0001,
       "step": 310
     },
     {
+      "epoch": 0.2732003469210755,
+      "grad_norm": 0.0006055298144929111,
+      "learning_rate": 0.00018237112851091802,
+      "loss": 0.0001,
       "step": 315
     },
     {
+      "epoch": 0.2775368603642671,
+      "grad_norm": 0.0002734568843152374,
+      "learning_rate": 0.00018150283375168114,
+      "loss": 0.0001,
       "step": 320
     },
     {
+      "epoch": 0.2818733738074588,
+      "grad_norm": 0.000268082192633301,
+      "learning_rate": 0.00018061583877544414,
+      "loss": 0.0001,
       "step": 325
     },
     {
+      "epoch": 0.2862098872506505,
+      "grad_norm": 0.0006183464429341257,
+      "learning_rate": 0.0001797103470965852,
+      "loss": 0.0001,
       "step": 330
     },
     {
+      "epoch": 0.29054640069384213,
+      "grad_norm": 0.0012671047588810325,
+      "learning_rate": 0.000178786566473413,
+      "loss": 0.0001,
       "step": 335
     },
     {
+      "epoch": 0.2948829141370338,
+      "grad_norm": 0.00037384298047982156,
+      "learning_rate": 0.00017784470886049783,
+      "loss": 0.0001,
       "step": 340
     },
     {
+      "epoch": 0.2992194275802255,
+      "grad_norm": 0.00038407245301641524,
+      "learning_rate": 0.0001768849903600406,
+      "loss": 0.0001,
       "step": 345
     },
     {
+      "epoch": 0.30355594102341715,
+      "grad_norm": 0.0005591203807853162,
+      "learning_rate": 0.00017590763117228934,
+      "loss": 0.0001,
       "step": 350
     },
     {
+      "epoch": 0.30789245446660884,
+      "grad_norm": 0.0004962561069987714,
+      "learning_rate": 0.00017491285554501636,
+      "loss": 0.0001,
       "step": 355
     },
     {
+      "epoch": 0.31222896790980054,
+      "grad_norm": 0.00041289450018666685,
+      "learning_rate": 0.00017390089172206592,
+      "loss": 0.0002,
       "step": 360
     },
     {
+      "epoch": 0.3165654813529922,
+      "grad_norm": 0.0003575518203433603,
+      "learning_rate": 0.00017287197189098556,
+      "loss": 0.0001,
       "step": 365
     },
     {
+      "epoch": 0.32090199479618386,
+      "grad_norm": 0.00035653176018968225,
+      "learning_rate": 0.0001718263321297523,
+      "loss": 0.0,
       "step": 370
     },
+    {
+      "epoch": 0.32523850823937556,
+      "grad_norm": 0.00047867343528196216,
+      "learning_rate": 0.00017076421235260648,
+      "loss": 0.0001,
+      "step": 375
+    },
+    {
+      "epoch": 0.3295750216825672,
+      "grad_norm": 0.0018968080403283238,
+      "learning_rate": 0.00016968585625500498,
+      "loss": 0.0003,
+      "step": 380
+    },
+    {
+      "epoch": 0.3339115351257589,
+      "grad_norm": 0.0008598743006587029,
+      "learning_rate": 0.00016859151125770737,
+      "loss": 0.0002,
+      "step": 385
+    },
+    {
+      "epoch": 0.3382480485689506,
+      "grad_norm": 0.001001562224701047,
+      "learning_rate": 0.0001674814284500068,
+      "loss": 0.0001,
+      "step": 390
+    },
+    {
+      "epoch": 0.3425845620121422,
+      "grad_norm": 0.0005547697655856609,
+      "learning_rate": 0.00016635586253211962,
+      "loss": 0.0001,
+      "step": 395
+    },
+    {
+      "epoch": 0.3469210754553339,
+      "grad_norm": 0.0012839736882597208,
+      "learning_rate": 0.00016521507175674643,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 0.3512575888985256,
+      "grad_norm": 0.00024862270220182836,
+      "learning_rate": 0.00016405931786981755,
+      "loss": 0.0001,
+      "step": 405
+    },
+    {
+      "epoch": 0.35559410234171723,
+      "grad_norm": 0.0018100353190675378,
+      "learning_rate": 0.00016288886605043764,
+      "loss": 0.0002,
+      "step": 410
+    },
+    {
+      "epoch": 0.3599306157849089,
+      "grad_norm": 0.0014100705739110708,
+      "learning_rate": 0.0001617039848500424,
+      "loss": 0.0001,
+      "step": 415
+    },
+    {
+      "epoch": 0.3642671292281006,
+      "grad_norm": 0.5226932764053345,
+      "learning_rate": 0.0001605049461307812,
+      "loss": 0.0009,
+      "step": 420
+    },
+    {
+      "epoch": 0.3686036426712923,
+      "grad_norm": 0.0063658421859145164,
+      "learning_rate": 0.00015929202500314051,
+      "loss": 0.0008,
+      "step": 425
+    },
+    {
+      "epoch": 0.37294015611448394,
+      "grad_norm": 0.0008500253898091614,
+      "learning_rate": 0.00015806549976282182,
+      "loss": 0.0006,
+      "step": 430
+    },
+    {
+      "epoch": 0.37727666955767564,
+      "grad_norm": 0.00786112155765295,
+      "learning_rate": 0.0001568256518268887,
+      "loss": 0.0002,
+      "step": 435
+    },
+    {
+      "epoch": 0.38161318300086733,
+      "grad_norm": 0.0010554317850619555,
+      "learning_rate": 0.00015557276566919784,
+      "loss": 0.0009,
+      "step": 440
+    },
+    {
+      "epoch": 0.38594969644405897,
+      "grad_norm": 0.08147475868463516,
+      "learning_rate": 0.0001543071287551287,
+      "loss": 0.0005,
+      "step": 445
+    },
+    {
+      "epoch": 0.39028620988725066,
+      "grad_norm": 0.00513398926705122,
+      "learning_rate": 0.0001530290314756265,
+      "loss": 0.0005,
+      "step": 450
+    },
+    {
+      "epoch": 0.39462272333044235,
+      "grad_norm": 0.012065605260431767,
+      "learning_rate": 0.00015173876708057438,
+      "loss": 0.0001,
+      "step": 455
+    },
+    {
+      "epoch": 0.398959236773634,
+      "grad_norm": 0.0004873398575000465,
+      "learning_rate": 0.00015043663161150937,
+      "loss": 0.0011,
+      "step": 460
+    },
+    {
+      "epoch": 0.4032957502168257,
+      "grad_norm": 0.006046785041689873,
+      "learning_rate": 0.00014912292383369787,
+      "loss": 0.0001,
+      "step": 465
+    },
+    {
+      "epoch": 0.40763226366001737,
+      "grad_norm": 0.0011943348217755556,
+      "learning_rate": 0.0001477979451675861,
+      "loss": 0.0001,
+      "step": 470
+    },
+    {
+      "epoch": 0.411968777103209,
+      "grad_norm": 0.000647057022433728,
+      "learning_rate": 0.0001464619996196415,
+      "loss": 0.0004,
+      "step": 475
+    },
+    {
+      "epoch": 0.4163052905464007,
+      "grad_norm": 0.04003003239631653,
+      "learning_rate": 0.00014511539371260074,
+      "loss": 0.0005,
+      "step": 480
+    },
+    {
+      "epoch": 0.4206418039895924,
+      "grad_norm": 0.02914806827902794,
+      "learning_rate": 0.00014375843641514034,
+      "loss": 0.0005,
+      "step": 485
+    },
+    {
+      "epoch": 0.424978317432784,
+      "grad_norm": 0.0006412832881323993,
+      "learning_rate": 0.0001423914390709861,
+      "loss": 0.001,
+      "step": 490
+    },
+    {
+      "epoch": 0.4293148308759757,
+      "grad_norm": 0.0005903305718675256,
+      "learning_rate": 0.00014101471532747752,
+      "loss": 0.0011,
+      "step": 495
+    },
+    {
+      "epoch": 0.4336513443191674,
+      "grad_norm": 0.0005716175073757768,
+      "learning_rate": 0.00013962858106360398,
+      "loss": 0.0012,
+      "step": 500
+    },
+    {
+      "epoch": 0.43798785776235905,
+      "grad_norm": 0.031988050788640976,
+      "learning_rate": 0.00013823335431752853,
+      "loss": 0.001,
+      "step": 505
+    },
+    {
+      "epoch": 0.44232437120555074,
+      "grad_norm": 0.003730722237378359,
+      "learning_rate": 0.00013682935521361627,
+      "loss": 0.0001,
+      "step": 510
+    },
+    {
+      "epoch": 0.44666088464874243,
+      "grad_norm": 0.0007267138571478426,
+      "learning_rate": 0.00013541690588898448,
+      "loss": 0.0001,
+      "step": 515
+    },
+    {
+      "epoch": 0.45099739809193407,
+      "grad_norm": 0.07632599025964737,
+      "learning_rate": 0.00013399633041959047,
+      "loss": 0.0003,
+      "step": 520
+    },
+    {
+      "epoch": 0.45533391153512576,
+      "grad_norm": 0.0006175984744913876,
+      "learning_rate": 0.00013256795474587485,
+      "loss": 0.0001,
+      "step": 525
+    },
+    {
+      "epoch": 0.45967042497831745,
+      "grad_norm": 0.0003872651723213494,
+      "learning_rate": 0.00013113210659797687,
+      "loss": 0.0001,
+      "step": 530
+    },
+    {
+      "epoch": 0.4640069384215091,
+      "grad_norm": 0.0009627993567846715,
+      "learning_rate": 0.00012968911542053923,
+      "loss": 0.0001,
+      "step": 535
+    },
+    {
+      "epoch": 0.4683434518647008,
+      "grad_norm": 0.0004405930812936276,
+      "learning_rate": 0.00012823931229711944,
+      "loss": 0.0001,
+      "step": 540
+    },
+    {
+      "epoch": 0.47267996530789247,
+      "grad_norm": 0.001180918887257576,
+      "learning_rate": 0.00012678302987422532,
+      "loss": 0.0001,
+      "step": 545
+    },
+    {
+      "epoch": 0.4770164787510841,
+      "grad_norm": 0.0004968827124685049,
+      "learning_rate": 0.00012532060228499136,
+      "loss": 0.0001,
+      "step": 550
+    },
+    {
+      "epoch": 0.4813529921942758,
+      "grad_norm": 0.0003979886241722852,
+      "learning_rate": 0.00012385236507251476,
+      "loss": 0.0001,
+      "step": 555
+    },
+    {
+      "epoch": 0.4856895056374675,
+      "grad_norm": 0.010322037152945995,
+      "learning_rate": 0.00012237865511286746,
+      "loss": 0.0001,
+      "step": 560
+    },
+    {
+      "epoch": 0.4900260190806591,
+      "grad_norm": 0.0002791626611724496,
+      "learning_rate": 0.00012089981053780224,
+      "loss": 0.0,
+      "step": 565
+    },
+    {
+      "epoch": 0.4943625325238508,
+      "grad_norm": 0.000490942969918251,
+      "learning_rate": 0.00011941617065717124,
+      "loss": 0.0001,
+      "step": 570
+    },
+    {
+      "epoch": 0.4986990459670425,
+      "grad_norm": 0.0003438853018451482,
+      "learning_rate": 0.00011792807588107357,
+      "loss": 0.0001,
+      "step": 575
+    },
+    {
+      "epoch": 0.5030355594102341,
+      "grad_norm": 0.0021341927349567413,
+      "learning_rate": 0.00011643586764175092,
+      "loss": 0.0001,
+      "step": 580
+    },
+    {
+      "epoch": 0.5073720728534259,
+      "grad_norm": 0.00031035722349770367,
+      "learning_rate": 0.00011493988831524834,
+      "loss": 0.0,
+      "step": 585
+    },
+    {
+      "epoch": 0.5117085862966175,
+      "grad_norm": 0.00061289023142308,
+      "learning_rate": 0.00011344048114285882,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 0.5160450997398092,
+      "grad_norm": 0.00996777880936861,
+      "learning_rate": 0.00011193799015236885,
+      "loss": 0.0001,
+      "step": 595
+    },
+    {
+      "epoch": 0.5203816131830009,
+      "grad_norm": 0.0004861672641709447,
+      "learning_rate": 0.00011043276007912413,
+      "loss": 0.0001,
+      "step": 600
+    },
+    {
+      "epoch": 0.5247181266261925,
+      "grad_norm": 0.0003749791067093611,
+      "learning_rate": 0.00010892513628693237,
+      "loss": 0.0,
+      "step": 605
+    },
+    {
+      "epoch": 0.5290546400693842,
+      "grad_norm": 0.00017323833890259266,
+      "learning_rate": 0.00010741546468882223,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 0.5333911535125759,
+      "grad_norm": 0.0002214506093878299,
+      "learning_rate": 0.00010590409166767614,
+      "loss": 0.0,
+      "step": 615
+    },
+    {
+      "epoch": 0.5377276669557676,
+      "grad_norm": 0.00023450802837032825,
+      "learning_rate": 0.00010439136399675542,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 0.5420641803989592,
+      "grad_norm": 0.00017662130994722247,
+      "learning_rate": 0.00010287762876013563,
+      "loss": 0.0,
+      "step": 625
+    },
+    {
+      "epoch": 0.546400693842151,
+      "grad_norm": 0.00017046746506821364,
+      "learning_rate": 0.00010136323327307075,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 0.5507372072853426,
+      "grad_norm": 0.00033422402339056134,
+      "learning_rate": 9.984852500230432e-05,
+      "loss": 0.0,
+      "step": 635
+    },
+    {
+      "epoch": 0.5550737207285342,
+      "grad_norm": 0.0006649333517998457,
+      "learning_rate": 9.833385148634574e-05,
+      "loss": 0.0001,
+      "step": 640
+    },
+    {
+      "epoch": 0.559410234171726,
+      "grad_norm": 0.0009602860664017498,
+      "learning_rate": 9.681956025573022e-05,
+      "loss": 0.0,
+      "step": 645
+    },
+    {
+      "epoch": 0.5637467476149176,
+      "grad_norm": 0.000601974839810282,
+      "learning_rate": 9.53059987532804e-05,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 0.5680832610581092,
+      "grad_norm": 0.0002987553598359227,
+      "learning_rate": 9.379351425438826e-05,
+      "loss": 0.0,
+      "step": 655
+    },
+    {
+      "epoch": 0.572419774501301,
+      "grad_norm": 0.00017630436923354864,
+      "learning_rate": 9.228245378733537e-05,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 0.5767562879444926,
+      "grad_norm": 0.00021065869077574462,
+      "learning_rate": 9.077316405366981e-05,
+      "loss": 0.0,
+      "step": 665
+    },
+    {
+      "epoch": 0.5810928013876843,
+      "grad_norm": 0.0003561481134966016,
+      "learning_rate": 8.926599134865808e-05,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 0.585429314830876,
+      "grad_norm": 0.000257289269939065,
+      "learning_rate": 8.776128148183028e-05,
+      "loss": 0.0,
+      "step": 675
+    },
+    {
+      "epoch": 0.5897658282740676,
+      "grad_norm": 0.00019555074686650187,
+      "learning_rate": 8.625937969763662e-05,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 0.5941023417172593,
+      "grad_norm": 0.00053867616225034,
+      "learning_rate": 8.476063059623375e-05,
+      "loss": 0.0,
+      "step": 685
+    },
+    {
+      "epoch": 0.598438855160451,
+      "grad_norm": 0.0004365757922641933,
+      "learning_rate": 8.326537805441884e-05,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 0.6027753686036427,
+      "grad_norm": 0.0002754295419435948,
+      "learning_rate": 8.177396514672939e-05,
+      "loss": 0.0,
+      "step": 695
+    },
+    {
+      "epoch": 0.6071118820468343,
+      "grad_norm": 0.0001531521265860647,
+      "learning_rate": 8.028673406672763e-05,
+      "loss": 0.0001,
+      "step": 700
+    },
+    {
+      "epoch": 0.611448395490026,
+      "grad_norm": 0.0001671235222602263,
+      "learning_rate": 7.880402604848662e-05,
+      "loss": 0.0,
+      "step": 705
+    },
+    {
+      "epoch": 0.6157849089332177,
+      "grad_norm": 0.0001691756333457306,
+      "learning_rate": 7.732618128829656e-05,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 0.6201214223764093,
+      "grad_norm": 0.000139987314469181,
+      "learning_rate": 7.585353886660954e-05,
+      "loss": 0.0,
+      "step": 715
+    },
+    {
+      "epoch": 0.6244579358196011,
+      "grad_norm": 0.00023128537577576935,
+      "learning_rate": 7.438643667023979e-05,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 0.6287944492627927,
+      "grad_norm": 0.0001494650059612468,
+      "learning_rate": 7.292521131483821e-05,
+      "loss": 0.0,
+      "step": 725
+    },
+    {
+      "epoch": 0.6331309627059843,
+      "grad_norm": 0.0011407610727474093,
+      "learning_rate": 7.147019806765836e-05,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 0.6374674761491761,
+      "grad_norm": 0.00017924026178661734,
+      "learning_rate": 7.002173077063197e-05,
+      "loss": 0.0,
+      "step": 735
+    },
+    {
+      "epoch": 0.6418039895923677,
+      "grad_norm": 0.00017581222346052527,
+      "learning_rate": 6.858014176377139e-05,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 0.6461405030355594,
+      "grad_norm": 0.0001402726920787245,
+      "learning_rate": 6.714576180891654e-05,
+      "loss": 0.0,
+      "step": 745
+    },
+    {
+      "epoch": 0.6504770164787511,
+      "grad_norm": 0.00018241746874991804,
+      "learning_rate": 6.57189200138442e-05,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 0.6548135299219427,
+      "grad_norm": 0.0002422356337774545,
+      "learning_rate": 6.429994375675661e-05,
+      "loss": 0.0001,
+      "step": 755
+    },
+    {
+      "epoch": 0.6591500433651344,
+      "grad_norm": 0.000627288012765348,
+      "learning_rate": 6.288915861116706e-05,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 0.6634865568083261,
+      "grad_norm": 0.0002638279111124575,
+      "learning_rate": 6.148688827119937e-05,
+      "loss": 0.0,
+      "step": 765
+    },
+    {
+      "epoch": 0.6678230702515178,
+      "grad_norm": 0.001158065744675696,
+      "learning_rate": 6.009345447731886e-05,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 0.6721595836947094,
+      "grad_norm": 0.00033756031189113855,
+      "learning_rate": 5.8709176942511136e-05,
+      "loss": 0.0,
+      "step": 775
+    },
+    {
+      "epoch": 0.6764960971379012,
+      "grad_norm": 0.000685289385728538,
+      "learning_rate": 5.733437327892661e-05,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 0.6808326105810928,
+      "grad_norm": 0.0002907447633333504,
+      "learning_rate": 5.596935892500663e-05,
+      "loss": 0.0,
+      "step": 785
+    },
+    {
+      "epoch": 0.6851691240242844,
+      "grad_norm": 0.00017344890511594713,
+      "learning_rate": 5.4614447073108375e-05,
+      "loss": 0.0001,
+      "step": 790
+    },
+    {
+      "epoch": 0.6895056374674762,
+      "grad_norm": 0.00025545316748321056,
+      "learning_rate": 5.326994859764552e-05,
+      "loss": 0.0,
+      "step": 795
+    },
+    {
+      "epoch": 0.6938421509106678,
+      "grad_norm": 0.000984342535957694,
+      "learning_rate": 5.193617198376004e-05,
+      "loss": 0.0001,
+      "step": 800
+    },
+    {
+      "epoch": 0.6981786643538594,
+      "grad_norm": 0.0002541212597861886,
+      "learning_rate": 5.0613423256542904e-05,
+      "loss": 0.0,
+      "step": 805
+    },
+    {
+      "epoch": 0.7025151777970512,
+      "grad_norm": 0.005005079321563244,
+      "learning_rate": 4.930200591081865e-05,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 0.7068516912402428,
+      "grad_norm": 0.0036286672111600637,
+      "learning_rate": 4.8002220841511045e-05,
+      "loss": 0.0,
+      "step": 815
+    },
+    {
+      "epoch": 0.7111882046834345,
+      "grad_norm": 0.0012679731007665396,
+      "learning_rate": 4.671436627460479e-05,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 0.7155247181266262,
+      "grad_norm": 0.00015154715219978243,
+      "learning_rate": 4.543873769871978e-05,
+      "loss": 0.0,
+      "step": 825
+    },
+    {
+      "epoch": 0.7198612315698178,
+      "grad_norm": 0.00019088915723841637,
+      "learning_rate": 4.417562779731355e-05,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 0.7241977450130095,
+      "grad_norm": 0.000255732040386647,
+      "learning_rate": 4.292532638152713e-05,
+      "loss": 0.0001,
+      "step": 835
+    },
+    {
+      "epoch": 0.7285342584562012,
+      "grad_norm": 0.00032757947337813675,
+      "learning_rate": 4.168812032369026e-05,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 0.7328707718993929,
+      "grad_norm": 0.0005451737088151276,
+      "learning_rate": 4.0464293491500326e-05,
+      "loss": 0.0005,
+      "step": 845
+    },
+    {
+      "epoch": 0.7372072853425846,
+      "grad_norm": 0.3482416570186615,
+      "learning_rate": 3.9254126682891425e-05,
+      "loss": 0.0004,
+      "step": 850
+    },
+    {
+      "epoch": 0.7415437987857763,
+      "grad_norm": 0.00018617334717418998,
+      "learning_rate": 3.8057897561607014e-05,
+      "loss": 0.0,
+      "step": 855
+    },
+    {
+      "epoch": 0.7458803122289679,
+      "grad_norm": 0.0004157496150583029,
+      "learning_rate": 3.68758805934923e-05,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 0.7502168256721596,
+      "grad_norm": 0.0003447196795605123,
+      "learning_rate": 3.5708346983519904e-05,
+      "loss": 0.0,
+      "step": 865
+    },
+    {
+      "epoch": 0.7545533391153513,
+      "grad_norm": 0.0001821848563849926,
+      "learning_rate": 3.455556461356413e-05,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 0.7588898525585429,
+      "grad_norm": 0.0008997240802273154,
+      "learning_rate": 3.3417797980937305e-05,
+      "loss": 0.0,
+      "step": 875
+    },
+    {
+      "epoch": 0.7632263660017347,
+      "grad_norm": 0.00041676373803056777,
+      "learning_rate": 3.229530813770281e-05,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 0.7675628794449263,
+      "grad_norm": 0.00024333845067303628,
+      "learning_rate": 3.118835263077874e-05,
+      "loss": 0.0,
+      "step": 885
+    },
+    {
+      "epoch": 0.7718993928881179,
+      "grad_norm": 0.00044431493734009564,
+      "learning_rate": 3.0097185442845653e-05,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 0.7762359063313097,
+      "grad_norm": 0.003919024486094713,
+      "learning_rate": 2.9022056934072106e-05,
+      "loss": 0.0,
+      "step": 895
+    },
+    {
+      "epoch": 0.7805724197745013,
+      "grad_norm": 0.00021601624030154198,
+      "learning_rate": 2.796321378467146e-05,
+      "loss": 0.0001,
+      "step": 900
+    },
+    {
+      "epoch": 0.784908933217693,
+      "grad_norm": 0.0003809454501606524,
+      "learning_rate": 2.6920898938302885e-05,
+      "loss": 0.0,
+      "step": 905
+    },
+    {
+      "epoch": 0.7892454466608847,
+      "grad_norm": 0.0009574461146257818,
+      "learning_rate": 2.5895351546329717e-05,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 0.7935819601040763,
+      "grad_norm": 0.0008413203759118915,
+      "learning_rate": 2.4886806912948035e-05,
+      "loss": 0.0,
+      "step": 915
+    },
+    {
+      "epoch": 0.797918473547268,
+      "grad_norm": 0.00016790963127277792,
+      "learning_rate": 2.3895496441197806e-05,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 0.8022549869904597,
+      "grad_norm": 0.0002135594404535368,
+      "learning_rate": 2.2921647579869275e-05,
+      "loss": 0.0,
+      "step": 925
+    },
+    {
+      "epoch": 0.8065915004336514,
+      "grad_norm": 0.0006223685923032463,
+      "learning_rate": 2.1965483771316498e-05,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 0.810928013876843,
+      "grad_norm": 0.0001733316748868674,
+      "learning_rate": 2.102722440019006e-05,
+      "loss": 0.0,
+      "step": 935
+    },
+    {
+      "epoch": 0.8152645273200347,
+      "grad_norm": 0.0006180764175951481,
+      "learning_rate": 2.0107084743101024e-05,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 0.8196010407632264,
+      "grad_norm": 0.019121604040265083,
+      "learning_rate": 1.9205275919227282e-05,
+      "loss": 0.0,
+      "step": 945
+    },
+    {
+      "epoch": 0.823937554206418,
+      "grad_norm": 0.00031261073308996856,
+      "learning_rate": 1.8322004841873842e-05,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 0.8282740676496098,
+      "grad_norm": 0.00025373531389050186,
+      "learning_rate": 1.7457474170998113e-05,
+      "loss": 0.0001,
+      "step": 955
+    },
+    {
+      "epoch": 0.8326105810928014,
+      "grad_norm": 0.00029701556195504963,
+      "learning_rate": 1.661188226671111e-05,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 0.836947094535993,
+      "grad_norm": 0.00018009443010669202,
+      "learning_rate": 1.5785423143765143e-05,
+      "loss": 0.0,
+      "step": 965
+    },
+    {
+      "epoch": 0.8412836079791848,
+      "grad_norm": 0.00026098068337887526,
+      "learning_rate": 1.4978286427038601e-05,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 0.8456201214223764,
+      "grad_norm": 0.00022298283874988556,
+      "learning_rate": 1.4190657308027989e-05,
+      "loss": 0.0,
+      "step": 975
+    },
+    {
+      "epoch": 0.849956634865568,
+      "grad_norm": 0.0003458283899817616,
+      "learning_rate": 1.3422716502357102e-05,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 0.8542931483087598,
+      "grad_norm": 0.00045949083869345486,
+      "learning_rate": 1.2674640208313137e-05,
+      "loss": 0.0,
+      "step": 985
+    },
+    {
+      "epoch": 0.8586296617519514,
+      "grad_norm": 0.00020235533884260803,
+      "learning_rate": 1.1946600066419345e-05,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 0.8629661751951431,
+      "grad_norm": 0.00016053752915468067,
+      "learning_rate": 1.1238763120053387e-05,
+      "loss": 0.0,
+      "step": 995
+    },
+    {
+      "epoch": 0.8673026886383348,
+      "grad_norm": 0.0002067391760647297,
+      "learning_rate": 1.0551291777120464e-05,
+      "loss": 0.0,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8716392020815265,
+      "grad_norm": 0.00024100695736706257,
+      "learning_rate": 9.884343772790005e-06,
+      "loss": 0.0,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8759757155247181,
+      "grad_norm": 0.00032670717337168753,
+      "learning_rate": 9.238072133304653e-06,
+      "loss": 0.0,
+      "step": 1010
+    },
+    {
+      "epoch": 0.8803122289679098,
+      "grad_norm": 0.0010708924382925034,
+      "learning_rate": 8.612625140869324e-06,
+      "loss": 0.0,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8846487424111015,
+      "grad_norm": 0.00016485978267155588,
+      "learning_rate": 8.00814629962916e-06,
+      "loss": 0.0,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8889852558542931,
+      "grad_norm": 0.0012277034111320972,
+      "learning_rate": 7.424774302743409e-06,
+      "loss": 0.0,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8933217692974849,
+      "grad_norm": 0.0001766424102243036,
+      "learning_rate": 6.862643000563407e-06,
+      "loss": 0.0,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8976582827406765,
+      "grad_norm": 0.00012875783431809396,
+      "learning_rate": 6.321881369921656e-06,
+      "loss": 0.0,
+      "step": 1035
+    },
+    {
+      "epoch": 0.9019947961838681,
+      "grad_norm": 0.0003258756478317082,
+      "learning_rate": 5.802613484538888e-06,
+      "loss": 0.0,
+      "step": 1040
+    },
+    {
+      "epoch": 0.9063313096270599,
+      "grad_norm": 0.0003074019914492965,
+      "learning_rate": 5.304958486556488e-06,
+      "loss": 0.0,
+      "step": 1045
+    },
+    {
+      "epoch": 0.9106678230702515,
+      "grad_norm": 0.00044591055484488606,
+      "learning_rate": 4.829030559200032e-06,
+      "loss": 0.0,
+      "step": 1050
+    },
+    {
+      "epoch": 0.9150043365134432,
+      "grad_norm": 0.00019902190251741558,
+      "learning_rate": 4.374938900580883e-06,
+      "loss": 0.0,
+      "step": 1055
+    },
+    {
+      "epoch": 0.9193408499566349,
+      "grad_norm": 0.0006033729878254235,
+      "learning_rate": 3.942787698641548e-06,
+      "loss": 0.0,
+      "step": 1060
+    },
+    {
+      "epoch": 0.9236773633998265,
+      "grad_norm": 0.00017164893506560475,
+      "learning_rate": 3.532676107250421e-06,
+      "loss": 0.0,
+      "step": 1065
+    },
+    {
+      "epoch": 0.9280138768430182,
+      "grad_norm": 0.0002156089904019609,
+      "learning_rate": 3.1446982234517474e-06,
+      "loss": 0.0,
+      "step": 1070
+    },
+    {
+      "epoch": 0.9323503902862099,
+      "grad_norm": 0.00040788273327052593,
+      "learning_rate": 2.7789430658757275e-06,
+      "loss": 0.0,
+      "step": 1075
+    },
+    {
+      "epoch": 0.9366869037294016,
+      "grad_norm": 0.0003488777147140354,
+      "learning_rate": 2.4354945543138775e-06,
+      "loss": 0.0,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9410234171725932,
+      "grad_norm": 0.0004901742213405669,
+      "learning_rate": 2.1144314904642195e-06,
+      "loss": 0.0,
+      "step": 1085
+    },
+    {
+      "epoch": 0.9453599306157849,
+      "grad_norm": 0.00025740411365404725,
+      "learning_rate": 1.8158275398508784e-06,
+      "loss": 0.0,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9496964440589766,
+      "grad_norm": 0.0002657132863532752,
+      "learning_rate": 1.539751214922014e-06,
+      "loss": 0.0,
+      "step": 1095
+    },
+    {
+      "epoch": 0.9540329575021682,
+      "grad_norm": 0.0013277851976454258,
+      "learning_rate": 1.2862658593302046e-06,
+      "loss": 0.0,
+      "step": 1100
+    },
+    {
+      "epoch": 0.95836947094536,
+      "grad_norm": 0.00018901302246376872,
+      "learning_rate": 1.0554296333987078e-06,
+      "loss": 0.0,
+      "step": 1105
+    },
+    {
+      "epoch": 0.9627059843885516,
+      "grad_norm": 0.00029082829132676125,
+      "learning_rate": 8.472955007769456e-07,
+      "loss": 0.0,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9670424978317432,
+      "grad_norm": 0.0001757108693709597,
+      "learning_rate": 6.619112162885022e-07,
+      "loss": 0.0,
+      "step": 1115
+    },
+    {
+      "epoch": 0.971379011274935,
+      "grad_norm": 0.00018936190463136882,
+      "learning_rate": 4.993193149740338e-07,
+      "loss": 0.0,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9757155247181266,
+      "grad_norm": 0.00027079100254923105,
+      "learning_rate": 3.595571023319755e-07,
+      "loss": 0.0,
+      "step": 1125
+    },
+    {
+      "epoch": 0.9800520381613183,
+      "grad_norm": 0.0003670993319246918,
+      "learning_rate": 2.426566457590651e-07,
+      "loss": 0.0,
+      "step": 1130
+    },
+    {
+      "epoch": 0.98438855160451,
+      "grad_norm": 0.00011753277067327872,
+      "learning_rate": 1.4864476719270714e-07,
+      "loss": 0.0,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9887250650477016,
+      "grad_norm": 0.01616135612130165,
+      "learning_rate": 7.754303695688414e-08,
+      "loss": 0.0002,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9930615784908933,
+      "grad_norm": 0.0007944769458845258,
+      "learning_rate": 2.936776881302672e-08,
+      "loss": 0.0002,
+      "step": 1145
+    },
+    {
+      "epoch": 0.997398091934085,
+      "grad_norm": 0.00044876235187985003,
+      "learning_rate": 4.130016216896682e-09,
+      "loss": 0.0,
+      "step": 1150
+    },
     {
       "epoch": 1.0,
+      "step": 1153,
+      "total_flos": 1.025624031790891e+18,
+      "train_loss": 0.004042043934228078,
+      "train_runtime": 6129.3476,
+      "train_samples_per_second": 3.01,
+      "train_steps_per_second": 0.188
     }
   ],
   "logging_steps": 5,
+  "max_steps": 1153,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.025624031790891e+18,
+  "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
 }