End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +2860 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: oh_scale_x4_compute_equal
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # oh_scale_x4_compute_equal
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7223

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh_scale_x4_compute_equal
 # oh_scale_x4_compute_equal
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/oh-dcft-v1.3_no-curation_gpt-4o-mini_scale_4x dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7223

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 3.99849454271735,
+    "eval_loss": 0.7222821116447449,
+    "eval_runtime": 690.268,
+    "eval_samples_per_second": 38.896,
+    "eval_steps_per_second": 0.608,
+    "total_flos": 6673139006177280.0,
+    "train_loss": 0.6817261522194468,
+    "train_runtime": 130344.1738,
+    "train_samples_per_second": 15.654,
+    "train_steps_per_second": 0.031
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 3.99849454271735,
+    "eval_loss": 0.7222821116447449,
+    "eval_runtime": 690.268,
+    "eval_samples_per_second": 38.896,
+    "eval_steps_per_second": 0.608
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 3.99849454271735,
+    "total_flos": 6673139006177280.0,
+    "train_loss": 0.6817261522194468,
+    "train_runtime": 130344.1738,
+    "train_samples_per_second": 15.654,
+    "train_steps_per_second": 0.031
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2860 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.99849454271735,
+  "eval_steps": 500,
+  "global_step": 3984,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0100363818843307,
+      "grad_norm": 33.33474786117092,
+      "learning_rate": 5e-06,
+      "loss": 1.0592,
+      "step": 10
+    },
+    {
+      "epoch": 0.0200727637686614,
+      "grad_norm": 1.3624139034064995,
+      "learning_rate": 5e-06,
+      "loss": 0.9442,
+      "step": 20
+    },
+    {
+      "epoch": 0.030109145652992095,
+      "grad_norm": 1.4119904374590468,
+      "learning_rate": 5e-06,
+      "loss": 0.8939,
+      "step": 30
+    },
+    {
+      "epoch": 0.0401455275373228,
+      "grad_norm": 1.528739282183849,
+      "learning_rate": 5e-06,
+      "loss": 0.8733,
+      "step": 40
+    },
+    {
+      "epoch": 0.050181909421653494,
+      "grad_norm": 1.0731508202908417,
+      "learning_rate": 5e-06,
+      "loss": 0.8605,
+      "step": 50
+    },
+    {
+      "epoch": 0.06021829130598419,
+      "grad_norm": 0.9332984692607159,
+      "learning_rate": 5e-06,
+      "loss": 0.8442,
+      "step": 60
+    },
+    {
+      "epoch": 0.07025467319031489,
+      "grad_norm": 0.872377007810567,
+      "learning_rate": 5e-06,
+      "loss": 0.8326,
+      "step": 70
+    },
+    {
+      "epoch": 0.0802910550746456,
+      "grad_norm": 1.1369570969633511,
+      "learning_rate": 5e-06,
+      "loss": 0.8255,
+      "step": 80
+    },
+    {
+      "epoch": 0.09032743695897628,
+      "grad_norm": 1.2039587178247435,
+      "learning_rate": 5e-06,
+      "loss": 0.8131,
+      "step": 90
+    },
+    {
+      "epoch": 0.10036381884330699,
+      "grad_norm": 0.7786189445202707,
+      "learning_rate": 5e-06,
+      "loss": 0.8089,
+      "step": 100
+    },
+    {
+      "epoch": 0.11040020072763769,
+      "grad_norm": 0.8167899418786717,
+      "learning_rate": 5e-06,
+      "loss": 0.8015,
+      "step": 110
+    },
+    {
+      "epoch": 0.12043658261196838,
+      "grad_norm": 0.7700801877494686,
+      "learning_rate": 5e-06,
+      "loss": 0.7984,
+      "step": 120
+    },
+    {
+      "epoch": 0.13047296449629908,
+      "grad_norm": 0.7344316347793653,
+      "learning_rate": 5e-06,
+      "loss": 0.8001,
+      "step": 130
+    },
+    {
+      "epoch": 0.14050934638062978,
+      "grad_norm": 0.8770050061566579,
+      "learning_rate": 5e-06,
+      "loss": 0.7932,
+      "step": 140
+    },
+    {
+      "epoch": 0.1505457282649605,
+      "grad_norm": 0.996787607804233,
+      "learning_rate": 5e-06,
+      "loss": 0.7958,
+      "step": 150
+    },
+    {
+      "epoch": 0.1605821101492912,
+      "grad_norm": 0.6341583113229519,
+      "learning_rate": 5e-06,
+      "loss": 0.7933,
+      "step": 160
+    },
+    {
+      "epoch": 0.17061849203362187,
+      "grad_norm": 0.7057153333092503,
+      "learning_rate": 5e-06,
+      "loss": 0.7864,
+      "step": 170
+    },
+    {
+      "epoch": 0.18065487391795257,
+      "grad_norm": 0.5875164626639859,
+      "learning_rate": 5e-06,
+      "loss": 0.7864,
+      "step": 180
+    },
+    {
+      "epoch": 0.19069125580228327,
+      "grad_norm": 0.696758643929393,
+      "learning_rate": 5e-06,
+      "loss": 0.7851,
+      "step": 190
+    },
+    {
+      "epoch": 0.20072763768661397,
+      "grad_norm": 0.6720306069577169,
+      "learning_rate": 5e-06,
+      "loss": 0.782,
+      "step": 200
+    },
+    {
+      "epoch": 0.21076401957094468,
+      "grad_norm": 0.7205930987361605,
+      "learning_rate": 5e-06,
+      "loss": 0.7802,
+      "step": 210
+    },
+    {
+      "epoch": 0.22080040145527538,
+      "grad_norm": 0.7263170071853633,
+      "learning_rate": 5e-06,
+      "loss": 0.7766,
+      "step": 220
+    },
+    {
+      "epoch": 0.23083678333960608,
+      "grad_norm": 0.6222177030840054,
+      "learning_rate": 5e-06,
+      "loss": 0.7811,
+      "step": 230
+    },
+    {
+      "epoch": 0.24087316522393676,
+      "grad_norm": 0.7933389948657629,
+      "learning_rate": 5e-06,
+      "loss": 0.7785,
+      "step": 240
+    },
+    {
+      "epoch": 0.25090954710826746,
+      "grad_norm": 0.677033021877631,
+      "learning_rate": 5e-06,
+      "loss": 0.774,
+      "step": 250
+    },
+    {
+      "epoch": 0.26094592899259816,
+      "grad_norm": 1.1592201237448847,
+      "learning_rate": 5e-06,
+      "loss": 0.7685,
+      "step": 260
+    },
+    {
+      "epoch": 0.27098231087692887,
+      "grad_norm": 0.8219803131042634,
+      "learning_rate": 5e-06,
+      "loss": 0.7733,
+      "step": 270
+    },
+    {
+      "epoch": 0.28101869276125957,
+      "grad_norm": 0.8862071608668877,
+      "learning_rate": 5e-06,
+      "loss": 0.7717,
+      "step": 280
+    },
+    {
+      "epoch": 0.2910550746455903,
+      "grad_norm": 0.7936049147378009,
+      "learning_rate": 5e-06,
+      "loss": 0.7732,
+      "step": 290
+    },
+    {
+      "epoch": 0.301091456529921,
+      "grad_norm": 1.0061666072778264,
+      "learning_rate": 5e-06,
+      "loss": 0.7694,
+      "step": 300
+    },
+    {
+      "epoch": 0.3111278384142517,
+      "grad_norm": 0.8032266175827166,
+      "learning_rate": 5e-06,
+      "loss": 0.7667,
+      "step": 310
+    },
+    {
+      "epoch": 0.3211642202985824,
+      "grad_norm": 0.6353992902115431,
+      "learning_rate": 5e-06,
+      "loss": 0.7651,
+      "step": 320
+    },
+    {
+      "epoch": 0.3312006021829131,
+      "grad_norm": 0.771841795504432,
+      "learning_rate": 5e-06,
+      "loss": 0.7628,
+      "step": 330
+    },
+    {
+      "epoch": 0.34123698406724373,
+      "grad_norm": 0.6845658986967137,
+      "learning_rate": 5e-06,
+      "loss": 0.7717,
+      "step": 340
+    },
+    {
+      "epoch": 0.35127336595157443,
+      "grad_norm": 0.5670233457983912,
+      "learning_rate": 5e-06,
+      "loss": 0.7634,
+      "step": 350
+    },
+    {
+      "epoch": 0.36130974783590514,
+      "grad_norm": 0.6478342801580839,
+      "learning_rate": 5e-06,
+      "loss": 0.7629,
+      "step": 360
+    },
+    {
+      "epoch": 0.37134612972023584,
+      "grad_norm": 0.6993852271462582,
+      "learning_rate": 5e-06,
+      "loss": 0.7633,
+      "step": 370
+    },
+    {
+      "epoch": 0.38138251160456654,
+      "grad_norm": 0.8808430833699983,
+      "learning_rate": 5e-06,
+      "loss": 0.7689,
+      "step": 380
+    },
+    {
+      "epoch": 0.39141889348889725,
+      "grad_norm": 0.8240632882958299,
+      "learning_rate": 5e-06,
+      "loss": 0.7595,
+      "step": 390
+    },
+    {
+      "epoch": 0.40145527537322795,
+      "grad_norm": 0.6461166154976471,
+      "learning_rate": 5e-06,
+      "loss": 0.758,
+      "step": 400
+    },
+    {
+      "epoch": 0.41149165725755865,
+      "grad_norm": 0.6015328676217312,
+      "learning_rate": 5e-06,
+      "loss": 0.7615,
+      "step": 410
+    },
+    {
+      "epoch": 0.42152803914188935,
+      "grad_norm": 0.6844312177243449,
+      "learning_rate": 5e-06,
+      "loss": 0.7604,
+      "step": 420
+    },
+    {
+      "epoch": 0.43156442102622006,
+      "grad_norm": 0.5903552392115103,
+      "learning_rate": 5e-06,
+      "loss": 0.762,
+      "step": 430
+    },
+    {
+      "epoch": 0.44160080291055076,
+      "grad_norm": 0.6188002518762006,
+      "learning_rate": 5e-06,
+      "loss": 0.7608,
+      "step": 440
+    },
+    {
+      "epoch": 0.45163718479488146,
+      "grad_norm": 0.5972767152570548,
+      "learning_rate": 5e-06,
+      "loss": 0.7498,
+      "step": 450
+    },
+    {
+      "epoch": 0.46167356667921217,
+      "grad_norm": 0.8557407016909138,
+      "learning_rate": 5e-06,
+      "loss": 0.7557,
+      "step": 460
+    },
+    {
+      "epoch": 0.47170994856354287,
+      "grad_norm": 0.5850889857419986,
+      "learning_rate": 5e-06,
+      "loss": 0.7506,
+      "step": 470
+    },
+    {
+      "epoch": 0.4817463304478735,
+      "grad_norm": 0.6245054223045923,
+      "learning_rate": 5e-06,
+      "loss": 0.7577,
+      "step": 480
+    },
+    {
+      "epoch": 0.4917827123322042,
+      "grad_norm": 0.8464030388596748,
+      "learning_rate": 5e-06,
+      "loss": 0.7524,
+      "step": 490
+    },
+    {
+      "epoch": 0.5018190942165349,
+      "grad_norm": 0.5975779064483824,
+      "learning_rate": 5e-06,
+      "loss": 0.7542,
+      "step": 500
+    },
+    {
+      "epoch": 0.5118554761008657,
+      "grad_norm": 0.5557828388284775,
+      "learning_rate": 5e-06,
+      "loss": 0.7562,
+      "step": 510
+    },
+    {
+      "epoch": 0.5218918579851963,
+      "grad_norm": 0.5041527087050589,
+      "learning_rate": 5e-06,
+      "loss": 0.7505,
+      "step": 520
+    },
+    {
+      "epoch": 0.5319282398695271,
+      "grad_norm": 0.606175554740277,
+      "learning_rate": 5e-06,
+      "loss": 0.7488,
+      "step": 530
+    },
+    {
+      "epoch": 0.5419646217538577,
+      "grad_norm": 0.6932655836595772,
+      "learning_rate": 5e-06,
+      "loss": 0.7459,
+      "step": 540
+    },
+    {
+      "epoch": 0.5520010036381884,
+      "grad_norm": 0.6342619574494085,
+      "learning_rate": 5e-06,
+      "loss": 0.748,
+      "step": 550
+    },
+    {
+      "epoch": 0.5620373855225191,
+      "grad_norm": 0.7273685186875936,
+      "learning_rate": 5e-06,
+      "loss": 0.7514,
+      "step": 560
+    },
+    {
+      "epoch": 0.5720737674068498,
+      "grad_norm": 0.6175109686722693,
+      "learning_rate": 5e-06,
+      "loss": 0.7487,
+      "step": 570
+    },
+    {
+      "epoch": 0.5821101492911805,
+      "grad_norm": 0.5906521692541239,
+      "learning_rate": 5e-06,
+      "loss": 0.7444,
+      "step": 580
+    },
+    {
+      "epoch": 0.5921465311755112,
+      "grad_norm": 0.5770083684995156,
+      "learning_rate": 5e-06,
+      "loss": 0.7463,
+      "step": 590
+    },
+    {
+      "epoch": 0.602182913059842,
+      "grad_norm": 0.5611205193626524,
+      "learning_rate": 5e-06,
+      "loss": 0.7501,
+      "step": 600
+    },
+    {
+      "epoch": 0.6122192949441726,
+      "grad_norm": 0.5626260973998957,
+      "learning_rate": 5e-06,
+      "loss": 0.7476,
+      "step": 610
+    },
+    {
+      "epoch": 0.6222556768285034,
+      "grad_norm": 0.5359369641368554,
+      "learning_rate": 5e-06,
+      "loss": 0.7438,
+      "step": 620
+    },
+    {
+      "epoch": 0.632292058712834,
+      "grad_norm": 0.5986655577379103,
+      "learning_rate": 5e-06,
+      "loss": 0.7486,
+      "step": 630
+    },
+    {
+      "epoch": 0.6423284405971648,
+      "grad_norm": 0.6386177918361722,
+      "learning_rate": 5e-06,
+      "loss": 0.7485,
+      "step": 640
+    },
+    {
+      "epoch": 0.6523648224814954,
+      "grad_norm": 0.5978315440058417,
+      "learning_rate": 5e-06,
+      "loss": 0.7433,
+      "step": 650
+    },
+    {
+      "epoch": 0.6624012043658262,
+      "grad_norm": 0.6036098551107931,
+      "learning_rate": 5e-06,
+      "loss": 0.7455,
+      "step": 660
+    },
+    {
+      "epoch": 0.6724375862501568,
+      "grad_norm": 0.5636202992429957,
+      "learning_rate": 5e-06,
+      "loss": 0.7466,
+      "step": 670
+    },
+    {
+      "epoch": 0.6824739681344875,
+      "grad_norm": 0.702719868075261,
+      "learning_rate": 5e-06,
+      "loss": 0.7423,
+      "step": 680
+    },
+    {
+      "epoch": 0.6925103500188182,
+      "grad_norm": 0.5833095780838476,
+      "learning_rate": 5e-06,
+      "loss": 0.7398,
+      "step": 690
+    },
+    {
+      "epoch": 0.7025467319031489,
+      "grad_norm": 0.6308085334601381,
+      "learning_rate": 5e-06,
+      "loss": 0.7429,
+      "step": 700
+    },
+    {
+      "epoch": 0.7125831137874796,
+      "grad_norm": 0.5487784356318717,
+      "learning_rate": 5e-06,
+      "loss": 0.7411,
+      "step": 710
+    },
+    {
+      "epoch": 0.7226194956718103,
+      "grad_norm": 0.5865151577104253,
+      "learning_rate": 5e-06,
+      "loss": 0.7428,
+      "step": 720
+    },
+    {
+      "epoch": 0.732655877556141,
+      "grad_norm": 0.6872870216284236,
+      "learning_rate": 5e-06,
+      "loss": 0.7431,
+      "step": 730
+    },
+    {
+      "epoch": 0.7426922594404717,
+      "grad_norm": 0.7183251325599549,
+      "learning_rate": 5e-06,
+      "loss": 0.7423,
+      "step": 740
+    },
+    {
+      "epoch": 0.7527286413248024,
+      "grad_norm": 0.7454754048807214,
+      "learning_rate": 5e-06,
+      "loss": 0.7387,
+      "step": 750
+    },
+    {
+      "epoch": 0.7627650232091331,
+      "grad_norm": 0.5531190685288089,
+      "learning_rate": 5e-06,
+      "loss": 0.743,
+      "step": 760
+    },
+    {
+      "epoch": 0.7728014050934638,
+      "grad_norm": 0.5534601111291695,
+      "learning_rate": 5e-06,
+      "loss": 0.741,
+      "step": 770
+    },
+    {
+      "epoch": 0.7828377869777945,
+      "grad_norm": 0.5724229278027059,
+      "learning_rate": 5e-06,
+      "loss": 0.742,
+      "step": 780
+    },
+    {
+      "epoch": 0.7928741688621253,
+      "grad_norm": 0.5444775257450564,
+      "learning_rate": 5e-06,
+      "loss": 0.7375,
+      "step": 790
+    },
+    {
+      "epoch": 0.8029105507464559,
+      "grad_norm": 0.5172792176589391,
+      "learning_rate": 5e-06,
+      "loss": 0.7387,
+      "step": 800
+    },
+    {
+      "epoch": 0.8129469326307867,
+      "grad_norm": 0.5788117817392925,
+      "learning_rate": 5e-06,
+      "loss": 0.7376,
+      "step": 810
+    },
+    {
+      "epoch": 0.8229833145151173,
+      "grad_norm": 0.5261510224866858,
+      "learning_rate": 5e-06,
+      "loss": 0.739,
+      "step": 820
+    },
+    {
+      "epoch": 0.833019696399448,
+      "grad_norm": 0.5280046534368359,
+      "learning_rate": 5e-06,
+      "loss": 0.7359,
+      "step": 830
+    },
+    {
+      "epoch": 0.8430560782837787,
+      "grad_norm": 0.5641435748973659,
+      "learning_rate": 5e-06,
+      "loss": 0.7349,
+      "step": 840
+    },
+    {
+      "epoch": 0.8530924601681094,
+      "grad_norm": 0.557571684728077,
+      "learning_rate": 5e-06,
+      "loss": 0.7365,
+      "step": 850
+    },
+    {
+      "epoch": 0.8631288420524401,
+      "grad_norm": 0.6296144133240885,
+      "learning_rate": 5e-06,
+      "loss": 0.7373,
+      "step": 860
+    },
+    {
+      "epoch": 0.8731652239367708,
+      "grad_norm": 0.5966607955551202,
+      "learning_rate": 5e-06,
+      "loss": 0.7353,
+      "step": 870
+    },
+    {
+      "epoch": 0.8832016058211015,
+      "grad_norm": 0.5563914474354427,
+      "learning_rate": 5e-06,
+      "loss": 0.7353,
+      "step": 880
+    },
+    {
+      "epoch": 0.8932379877054322,
+      "grad_norm": 0.536620041860774,
+      "learning_rate": 5e-06,
+      "loss": 0.7338,
+      "step": 890
+    },
+    {
+      "epoch": 0.9032743695897629,
+      "grad_norm": 0.5387289422962349,
+      "learning_rate": 5e-06,
+      "loss": 0.7372,
+      "step": 900
+    },
+    {
+      "epoch": 0.9133107514740936,
+      "grad_norm": 0.6013347978585226,
+      "learning_rate": 5e-06,
+      "loss": 0.7337,
+      "step": 910
+    },
+    {
+      "epoch": 0.9233471333584243,
+      "grad_norm": 0.583886043468759,
+      "learning_rate": 5e-06,
+      "loss": 0.7336,
+      "step": 920
+    },
+    {
+      "epoch": 0.933383515242755,
+      "grad_norm": 0.5382696935521819,
+      "learning_rate": 5e-06,
+      "loss": 0.7368,
+      "step": 930
+    },
+    {
+      "epoch": 0.9434198971270857,
+      "grad_norm": 0.5653997036590966,
+      "learning_rate": 5e-06,
+      "loss": 0.7345,
+      "step": 940
+    },
+    {
+      "epoch": 0.9534562790114164,
+      "grad_norm": 0.6567632931315763,
+      "learning_rate": 5e-06,
+      "loss": 0.7304,
+      "step": 950
+    },
+    {
+      "epoch": 0.963492660895747,
+      "grad_norm": 0.6418916205752738,
+      "learning_rate": 5e-06,
+      "loss": 0.7301,
+      "step": 960
+    },
+    {
+      "epoch": 0.9735290427800778,
+      "grad_norm": 0.6530499607318279,
+      "learning_rate": 5e-06,
+      "loss": 0.7387,
+      "step": 970
+    },
+    {
+      "epoch": 0.9835654246644084,
+      "grad_norm": 0.5551081098759132,
+      "learning_rate": 5e-06,
+      "loss": 0.7333,
+      "step": 980
+    },
+    {
+      "epoch": 0.9936018065487392,
+      "grad_norm": 0.6077836691860066,
+      "learning_rate": 5e-06,
+      "loss": 0.7336,
+      "step": 990
+    },
+    {
+      "epoch": 0.9996236356793377,
+      "eval_loss": 0.7306540608406067,
+      "eval_runtime": 710.0823,
+      "eval_samples_per_second": 37.811,
+      "eval_steps_per_second": 0.591,
+      "step": 996
+    },
+    {
+      "epoch": 1.0036381884330698,
+      "grad_norm": 0.7501884083987931,
+      "learning_rate": 5e-06,
+      "loss": 0.7631,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0136745703174006,
+      "grad_norm": 0.6377477928653016,
+      "learning_rate": 5e-06,
+      "loss": 0.6949,
+      "step": 1010
+    },
+    {
+      "epoch": 1.0237109522017314,
+      "grad_norm": 0.5049710429815633,
+      "learning_rate": 5e-06,
+      "loss": 0.6968,
+      "step": 1020
+    },
+    {
+      "epoch": 1.033747334086062,
+      "grad_norm": 0.6837967725702919,
+      "learning_rate": 5e-06,
+      "loss": 0.6905,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0437837159703927,
+      "grad_norm": 0.5356223402685194,
+      "learning_rate": 5e-06,
+      "loss": 0.6922,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0538200978547234,
+      "grad_norm": 0.8370676577254016,
+      "learning_rate": 5e-06,
+      "loss": 0.6901,
+      "step": 1050
+    },
+    {
+      "epoch": 1.0638564797390542,
+      "grad_norm": 0.6105636596448625,
+      "learning_rate": 5e-06,
+      "loss": 0.6922,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0738928616233847,
+      "grad_norm": 0.5927877282574158,
+      "learning_rate": 5e-06,
+      "loss": 0.6979,
+      "step": 1070
+    },
+    {
+      "epoch": 1.0839292435077155,
+      "grad_norm": 0.540102358596148,
+      "learning_rate": 5e-06,
+      "loss": 0.6958,
+      "step": 1080
+    },
+    {
+      "epoch": 1.0939656253920462,
+      "grad_norm": 0.59126294445945,
+      "learning_rate": 5e-06,
+      "loss": 0.6931,
+      "step": 1090
+    },
+    {
+      "epoch": 1.1040020072763768,
+      "grad_norm": 0.6608825970633089,
+      "learning_rate": 5e-06,
+      "loss": 0.6937,
+      "step": 1100
+    },
+    {
+      "epoch": 1.1140383891607075,
+      "grad_norm": 0.5164144607050836,
+      "learning_rate": 5e-06,
+      "loss": 0.6917,
+      "step": 1110
+    },
+    {
+      "epoch": 1.1240747710450383,
+      "grad_norm": 0.5814527717684509,
+      "learning_rate": 5e-06,
+      "loss": 0.6972,
+      "step": 1120
+    },
+    {
+      "epoch": 1.134111152929369,
+      "grad_norm": 0.8029370829094067,
+      "learning_rate": 5e-06,
+      "loss": 0.6911,
+      "step": 1130
+    },
+    {
+      "epoch": 1.1441475348136996,
+      "grad_norm": 0.5560867302991094,
+      "learning_rate": 5e-06,
+      "loss": 0.6934,
+      "step": 1140
+    },
+    {
+      "epoch": 1.1541839166980303,
+      "grad_norm": 0.5562556075209857,
+      "learning_rate": 5e-06,
+      "loss": 0.6966,
+      "step": 1150
+    },
+    {
+      "epoch": 1.164220298582361,
+      "grad_norm": 0.5466240170135728,
+      "learning_rate": 5e-06,
+      "loss": 0.6905,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1742566804666918,
+      "grad_norm": 0.6181691352555871,
+      "learning_rate": 5e-06,
+      "loss": 0.6949,
+      "step": 1170
+    },
+    {
+      "epoch": 1.1842930623510224,
+      "grad_norm": 0.5816875585540926,
+      "learning_rate": 5e-06,
+      "loss": 0.6982,
+      "step": 1180
+    },
+    {
+      "epoch": 1.1943294442353531,
+      "grad_norm": 0.6682656613454141,
+      "learning_rate": 5e-06,
+      "loss": 0.6975,
+      "step": 1190
+    },
+    {
+      "epoch": 1.204365826119684,
+      "grad_norm": 0.5227039044223272,
+      "learning_rate": 5e-06,
+      "loss": 0.6938,
+      "step": 1200
+    },
+    {
+      "epoch": 1.2144022080040147,
+      "grad_norm": 0.5934796942110975,
+      "learning_rate": 5e-06,
+      "loss": 0.6928,
+      "step": 1210
+    },
+    {
+      "epoch": 1.2244385898883452,
+      "grad_norm": 0.7797726771495517,
+      "learning_rate": 5e-06,
+      "loss": 0.6966,
+      "step": 1220
+    },
+    {
+      "epoch": 1.234474971772676,
+      "grad_norm": 0.529604849325101,
+      "learning_rate": 5e-06,
+      "loss": 0.6923,
+      "step": 1230
+    },
+    {
+      "epoch": 1.2445113536570067,
+      "grad_norm": 0.6023982359465729,
+      "learning_rate": 5e-06,
+      "loss": 0.6945,
+      "step": 1240
+    },
+    {
+      "epoch": 1.2545477355413372,
+      "grad_norm": 0.5560385701612578,
+      "learning_rate": 5e-06,
+      "loss": 0.6944,
+      "step": 1250
+    },
+    {
+      "epoch": 1.264584117425668,
+      "grad_norm": 0.5283876353062209,
+      "learning_rate": 5e-06,
+      "loss": 0.6959,
+      "step": 1260
+    },
+    {
+      "epoch": 1.2746204993099988,
+      "grad_norm": 0.673569525320333,
+      "learning_rate": 5e-06,
+      "loss": 0.6957,
+      "step": 1270
+    },
+    {
+      "epoch": 1.2846568811943295,
+      "grad_norm": 0.5956349315336121,
+      "learning_rate": 5e-06,
+      "loss": 0.6919,
+      "step": 1280
+    },
+    {
+      "epoch": 1.29469326307866,
+      "grad_norm": 0.5672304692349656,
+      "learning_rate": 5e-06,
+      "loss": 0.6886,
+      "step": 1290
+    },
+    {
+      "epoch": 1.3047296449629908,
+      "grad_norm": 0.5877274033488099,
+      "learning_rate": 5e-06,
+      "loss": 0.6962,
+      "step": 1300
+    },
+    {
+      "epoch": 1.3147660268473216,
+      "grad_norm": 0.6060436152781087,
+      "learning_rate": 5e-06,
+      "loss": 0.6972,
+      "step": 1310
+    },
+    {
+      "epoch": 1.3248024087316521,
+      "grad_norm": 0.566578140937884,
+      "learning_rate": 5e-06,
+      "loss": 0.6929,
+      "step": 1320
+    },
+    {
+      "epoch": 1.3348387906159829,
+      "grad_norm": 0.5294558357936279,
+      "learning_rate": 5e-06,
+      "loss": 0.6879,
+      "step": 1330
+    },
+    {
+      "epoch": 1.3448751725003136,
+      "grad_norm": 0.6237162250745834,
+      "learning_rate": 5e-06,
+      "loss": 0.6963,
+      "step": 1340
+    },
+    {
+      "epoch": 1.3549115543846444,
+      "grad_norm": 0.5430804692697989,
+      "learning_rate": 5e-06,
+      "loss": 0.6879,
+      "step": 1350
+    },
+    {
+      "epoch": 1.3649479362689751,
+      "grad_norm": 0.7283462410509066,
+      "learning_rate": 5e-06,
+      "loss": 0.6976,
+      "step": 1360
+    },
+    {
+      "epoch": 1.3749843181533057,
+      "grad_norm": 0.6917759789030041,
+      "learning_rate": 5e-06,
+      "loss": 0.6852,
+      "step": 1370
+    },
+    {
+      "epoch": 1.3850207000376364,
+      "grad_norm": 0.5567789537021975,
+      "learning_rate": 5e-06,
+      "loss": 0.6948,
+      "step": 1380
+    },
+    {
+      "epoch": 1.3950570819219672,
+      "grad_norm": 0.5454605064190993,
+      "learning_rate": 5e-06,
+      "loss": 0.6954,
+      "step": 1390
+    },
+    {
+      "epoch": 1.4050934638062977,
+      "grad_norm": 0.5900124519339845,
+      "learning_rate": 5e-06,
+      "loss": 0.6921,
+      "step": 1400
+    },
+    {
+      "epoch": 1.4151298456906285,
+      "grad_norm": 0.5690895340205371,
+      "learning_rate": 5e-06,
+      "loss": 0.6907,
+      "step": 1410
+    },
+    {
+      "epoch": 1.4251662275749593,
+      "grad_norm": 0.5672995415143901,
+      "learning_rate": 5e-06,
+      "loss": 0.6924,
+      "step": 1420
+    },
+    {
+      "epoch": 1.43520260945929,
+      "grad_norm": 0.5311142120372314,
+      "learning_rate": 5e-06,
+      "loss": 0.696,
+      "step": 1430
+    },
+    {
+      "epoch": 1.4452389913436205,
+      "grad_norm": 0.5999039652092255,
+      "learning_rate": 5e-06,
+      "loss": 0.6902,
+      "step": 1440
+    },
+    {
+      "epoch": 1.4552753732279513,
+      "grad_norm": 0.5933508701370325,
+      "learning_rate": 5e-06,
+      "loss": 0.6921,
+      "step": 1450
+    },
+    {
+      "epoch": 1.465311755112282,
+      "grad_norm": 0.5165015340634761,
+      "learning_rate": 5e-06,
+      "loss": 0.6926,
+      "step": 1460
+    },
+    {
+      "epoch": 1.4753481369966126,
+      "grad_norm": 0.542825456215961,
+      "learning_rate": 5e-06,
+      "loss": 0.6924,
+      "step": 1470
+    },
+    {
+      "epoch": 1.4853845188809434,
+      "grad_norm": 0.5173942736182007,
+      "learning_rate": 5e-06,
+      "loss": 0.6889,
+      "step": 1480
+    },
+    {
+      "epoch": 1.4954209007652741,
+      "grad_norm": 0.5444379147874959,
+      "learning_rate": 5e-06,
+      "loss": 0.6895,
+      "step": 1490
+    },
+    {
+      "epoch": 1.5054572826496049,
+      "grad_norm": 0.5186245031758688,
+      "learning_rate": 5e-06,
+      "loss": 0.6955,
+      "step": 1500
+    },
+    {
+      "epoch": 1.5154936645339356,
+      "grad_norm": 0.5273924158023976,
+      "learning_rate": 5e-06,
+      "loss": 0.6971,
+      "step": 1510
+    },
+    {
+      "epoch": 1.5255300464182662,
+      "grad_norm": 0.5676022244683365,
+      "learning_rate": 5e-06,
+      "loss": 0.695,
+      "step": 1520
+    },
+    {
+      "epoch": 1.535566428302597,
+      "grad_norm": 0.6271761201828187,
+      "learning_rate": 5e-06,
+      "loss": 0.6917,
+      "step": 1530
+    },
+    {
+      "epoch": 1.5456028101869275,
+      "grad_norm": 0.6124159982732021,
+      "learning_rate": 5e-06,
+      "loss": 0.6911,
+      "step": 1540
+    },
+    {
+      "epoch": 1.5556391920712582,
+      "grad_norm": 0.6862261314401948,
+      "learning_rate": 5e-06,
+      "loss": 0.6864,
+      "step": 1550
+    },
+    {
+      "epoch": 1.565675573955589,
+      "grad_norm": 0.6275955880807454,
+      "learning_rate": 5e-06,
+      "loss": 0.6936,
+      "step": 1560
+    },
+    {
+      "epoch": 1.5757119558399197,
+      "grad_norm": 0.6289883161946416,
+      "learning_rate": 5e-06,
+      "loss": 0.6894,
+      "step": 1570
+    },
+    {
+      "epoch": 1.5857483377242505,
+      "grad_norm": 0.580105480957792,
+      "learning_rate": 5e-06,
+      "loss": 0.693,
+      "step": 1580
+    },
+    {
+      "epoch": 1.5957847196085813,
+      "grad_norm": 0.5560879464825351,
+      "learning_rate": 5e-06,
+      "loss": 0.692,
+      "step": 1590
+    },
+    {
+      "epoch": 1.6058211014929118,
+      "grad_norm": 0.58969515178897,
+      "learning_rate": 5e-06,
+      "loss": 0.6916,
+      "step": 1600
+    },
+    {
+      "epoch": 1.6158574833772426,
+      "grad_norm": 0.626694720233302,
+      "learning_rate": 5e-06,
+      "loss": 0.6867,
+      "step": 1610
+    },
+    {
+      "epoch": 1.625893865261573,
+      "grad_norm": 0.5549455637786119,
+      "learning_rate": 5e-06,
+      "loss": 0.6897,
+      "step": 1620
+    },
+    {
+      "epoch": 1.6359302471459038,
+      "grad_norm": 0.5938553058669086,
+      "learning_rate": 5e-06,
+      "loss": 0.6921,
+      "step": 1630
+    },
+    {
+      "epoch": 1.6459666290302346,
+      "grad_norm": 0.5753903253177306,
+      "learning_rate": 5e-06,
+      "loss": 0.6848,
+      "step": 1640
+    },
+    {
+      "epoch": 1.6560030109145654,
+      "grad_norm": 0.5191405989431156,
+      "learning_rate": 5e-06,
+      "loss": 0.69,
+      "step": 1650
+    },
+    {
+      "epoch": 1.6660393927988961,
+      "grad_norm": 0.5561359520494403,
+      "learning_rate": 5e-06,
+      "loss": 0.6986,
+      "step": 1660
+    },
+    {
+      "epoch": 1.6760757746832267,
+      "grad_norm": 0.6203674148098636,
+      "learning_rate": 5e-06,
+      "loss": 0.6908,
+      "step": 1670
+    },
+    {
+      "epoch": 1.6861121565675574,
+      "grad_norm": 0.6125267283499654,
+      "learning_rate": 5e-06,
+      "loss": 0.6914,
+      "step": 1680
+    },
+    {
+      "epoch": 1.696148538451888,
+      "grad_norm": 0.6670104437553405,
+      "learning_rate": 5e-06,
+      "loss": 0.6921,
+      "step": 1690
+    },
+    {
+      "epoch": 1.7061849203362187,
+      "grad_norm": 0.6164770721352887,
+      "learning_rate": 5e-06,
+      "loss": 0.6906,
+      "step": 1700
+    },
+    {
+      "epoch": 1.7162213022205495,
+      "grad_norm": 0.5419884156579401,
+      "learning_rate": 5e-06,
+      "loss": 0.6943,
+      "step": 1710
+    },
+    {
+      "epoch": 1.7262576841048802,
+      "grad_norm": 0.5735849533542166,
+      "learning_rate": 5e-06,
+      "loss": 0.69,
+      "step": 1720
+    },
+    {
+      "epoch": 1.736294065989211,
+      "grad_norm": 0.5297233454269522,
+      "learning_rate": 5e-06,
+      "loss": 0.6892,
+      "step": 1730
+    },
+    {
+      "epoch": 1.7463304478735417,
+      "grad_norm": 0.5393616139715296,
+      "learning_rate": 5e-06,
+      "loss": 0.6915,
+      "step": 1740
+    },
+    {
+      "epoch": 1.7563668297578723,
+      "grad_norm": 0.5643636572625912,
+      "learning_rate": 5e-06,
+      "loss": 0.6946,
+      "step": 1750
+    },
+    {
+      "epoch": 1.7664032116422028,
+      "grad_norm": 0.5884196950446513,
+      "learning_rate": 5e-06,
+      "loss": 0.6888,
+      "step": 1760
+    },
+    {
+      "epoch": 1.7764395935265336,
+      "grad_norm": 0.5236101003130557,
+      "learning_rate": 5e-06,
+      "loss": 0.687,
+      "step": 1770
+    },
+    {
+      "epoch": 1.7864759754108643,
+      "grad_norm": 0.5035132462272621,
+      "learning_rate": 5e-06,
+      "loss": 0.6888,
+      "step": 1780
+    },
+    {
+      "epoch": 1.796512357295195,
+      "grad_norm": 0.503023782309955,
+      "learning_rate": 5e-06,
+      "loss": 0.697,
+      "step": 1790
+    },
+    {
+      "epoch": 1.8065487391795259,
+      "grad_norm": 0.6043844268122752,
+      "learning_rate": 5e-06,
+      "loss": 0.6939,
+      "step": 1800
+    },
+    {
+      "epoch": 1.8165851210638566,
+      "grad_norm": 0.5232601740320375,
+      "learning_rate": 5e-06,
+      "loss": 0.6904,
+      "step": 1810
+    },
+    {
+      "epoch": 1.8266215029481871,
+      "grad_norm": 0.5370830872283877,
+      "learning_rate": 5e-06,
+      "loss": 0.6911,
+      "step": 1820
+    },
+    {
+      "epoch": 1.836657884832518,
+      "grad_norm": 0.6459700462195024,
+      "learning_rate": 5e-06,
+      "loss": 0.6882,
+      "step": 1830
+    },
+    {
+      "epoch": 1.8466942667168484,
+      "grad_norm": 0.582019267718641,
+      "learning_rate": 5e-06,
+      "loss": 0.6922,
+      "step": 1840
+    },
+    {
+      "epoch": 1.8567306486011792,
+      "grad_norm": 0.5870687122095531,
+      "learning_rate": 5e-06,
+      "loss": 0.6922,
+      "step": 1850
+    },
+    {
+      "epoch": 1.86676703048551,
+      "grad_norm": 0.5078692726109669,
+      "learning_rate": 5e-06,
+      "loss": 0.6885,
+      "step": 1860
+    },
+    {
+      "epoch": 1.8768034123698407,
+      "grad_norm": 0.5040649883447891,
+      "learning_rate": 5e-06,
+      "loss": 0.6881,
+      "step": 1870
+    },
+    {
+      "epoch": 1.8868397942541715,
+      "grad_norm": 0.4995959267347782,
+      "learning_rate": 5e-06,
+      "loss": 0.6862,
+      "step": 1880
+    },
+    {
+      "epoch": 1.8968761761385022,
+      "grad_norm": 0.5881241901658035,
+      "learning_rate": 5e-06,
+      "loss": 0.6917,
+      "step": 1890
+    },
+    {
+      "epoch": 1.9069125580228328,
+      "grad_norm": 0.5588343821948456,
+      "learning_rate": 5e-06,
+      "loss": 0.6888,
+      "step": 1900
+    },
+    {
+      "epoch": 1.9169489399071633,
+      "grad_norm": 0.4870751715972866,
+      "learning_rate": 5e-06,
+      "loss": 0.6872,
+      "step": 1910
+    },
+    {
+      "epoch": 1.926985321791494,
+      "grad_norm": 0.5735655591316463,
+      "learning_rate": 5e-06,
+      "loss": 0.6947,
+      "step": 1920
+    },
+    {
+      "epoch": 1.9370217036758248,
+      "grad_norm": 0.6085871436334919,
+      "learning_rate": 5e-06,
+      "loss": 0.6888,
+      "step": 1930
+    },
+    {
+      "epoch": 1.9470580855601556,
+      "grad_norm": 0.6006262589490265,
+      "learning_rate": 5e-06,
+      "loss": 0.6884,
+      "step": 1940
+    },
+    {
+      "epoch": 1.9570944674444863,
+      "grad_norm": 0.6341474482034111,
+      "learning_rate": 5e-06,
+      "loss": 0.6886,
+      "step": 1950
+    },
+    {
+      "epoch": 1.967130849328817,
+      "grad_norm": 0.5362807629078432,
+      "learning_rate": 5e-06,
+      "loss": 0.6873,
+      "step": 1960
+    },
+    {
+      "epoch": 1.9771672312131476,
+      "grad_norm": 0.4940952174490118,
+      "learning_rate": 5e-06,
+      "loss": 0.6895,
+      "step": 1970
+    },
+    {
+      "epoch": 1.9872036130974784,
+      "grad_norm": 0.5068555383392377,
+      "learning_rate": 5e-06,
+      "loss": 0.6886,
+      "step": 1980
+    },
+    {
+      "epoch": 1.997239994981809,
+      "grad_norm": 0.5115531434616939,
+      "learning_rate": 5e-06,
+      "loss": 0.6923,
+      "step": 1990
+    },
+    {
+      "epoch": 1.999247271358675,
+      "eval_loss": 0.7167317867279053,
+      "eval_runtime": 681.5165,
+      "eval_samples_per_second": 39.396,
+      "eval_steps_per_second": 0.616,
+      "step": 1992
+    },
+    {
+      "epoch": 2.0072763768661397,
+      "grad_norm": 0.7464092668086445,
+      "learning_rate": 5e-06,
+      "loss": 0.7021,
+      "step": 2000
+    },
+    {
+      "epoch": 2.0173127587504704,
+      "grad_norm": 0.6285168143241044,
+      "learning_rate": 5e-06,
+      "loss": 0.6464,
+      "step": 2010
+    },
+    {
+      "epoch": 2.027349140634801,
+      "grad_norm": 0.6153973121048567,
+      "learning_rate": 5e-06,
+      "loss": 0.6468,
+      "step": 2020
+    },
+    {
+      "epoch": 2.037385522519132,
+      "grad_norm": 0.5873302210415512,
+      "learning_rate": 5e-06,
+      "loss": 0.6466,
+      "step": 2030
+    },
+    {
+      "epoch": 2.0474219044034627,
+      "grad_norm": 0.6320791509681155,
+      "learning_rate": 5e-06,
+      "loss": 0.645,
+      "step": 2040
+    },
+    {
+      "epoch": 2.057458286287793,
+      "grad_norm": 0.7492672540101699,
+      "learning_rate": 5e-06,
+      "loss": 0.6484,
+      "step": 2050
+    },
+    {
+      "epoch": 2.067494668172124,
+      "grad_norm": 0.6049252570937941,
+      "learning_rate": 5e-06,
+      "loss": 0.6479,
+      "step": 2060
+    },
+    {
+      "epoch": 2.0775310500564546,
+      "grad_norm": 0.5658472447863507,
+      "learning_rate": 5e-06,
+      "loss": 0.646,
+      "step": 2070
+    },
+    {
+      "epoch": 2.0875674319407853,
+      "grad_norm": 0.5516870136698201,
+      "learning_rate": 5e-06,
+      "loss": 0.6492,
+      "step": 2080
+    },
+    {
+      "epoch": 2.097603813825116,
+      "grad_norm": 0.7029586116565276,
+      "learning_rate": 5e-06,
+      "loss": 0.6538,
+      "step": 2090
+    },
+    {
+      "epoch": 2.107640195709447,
+      "grad_norm": 0.5501781567288918,
+      "learning_rate": 5e-06,
+      "loss": 0.6468,
+      "step": 2100
+    },
+    {
+      "epoch": 2.1176765775937776,
+      "grad_norm": 0.5732859435604044,
+      "learning_rate": 5e-06,
+      "loss": 0.6468,
+      "step": 2110
+    },
+    {
+      "epoch": 2.1277129594781083,
+      "grad_norm": 0.6097094231081762,
+      "learning_rate": 5e-06,
+      "loss": 0.645,
+      "step": 2120
+    },
+    {
+      "epoch": 2.1377493413624387,
+      "grad_norm": 0.6168934436640728,
+      "learning_rate": 5e-06,
+      "loss": 0.6495,
+      "step": 2130
+    },
+    {
+      "epoch": 2.1477857232467694,
+      "grad_norm": 0.5791030315614372,
+      "learning_rate": 5e-06,
+      "loss": 0.6494,
+      "step": 2140
+    },
+    {
+      "epoch": 2.1578221051311,
+      "grad_norm": 0.5248289813751731,
+      "learning_rate": 5e-06,
+      "loss": 0.6502,
+      "step": 2150
+    },
+    {
+      "epoch": 2.167858487015431,
+      "grad_norm": 0.5339345181224099,
+      "learning_rate": 5e-06,
+      "loss": 0.6469,
+      "step": 2160
+    },
+    {
+      "epoch": 2.1778948688997617,
+      "grad_norm": 0.5503349261990413,
+      "learning_rate": 5e-06,
+      "loss": 0.6445,
+      "step": 2170
+    },
+    {
+      "epoch": 2.1879312507840925,
+      "grad_norm": 0.5464800148102408,
+      "learning_rate": 5e-06,
+      "loss": 0.6506,
+      "step": 2180
+    },
+    {
+      "epoch": 2.197967632668423,
+      "grad_norm": 0.5247947975936182,
+      "learning_rate": 5e-06,
+      "loss": 0.6527,
+      "step": 2190
+    },
+    {
+      "epoch": 2.2080040145527535,
+      "grad_norm": 0.5606999830634875,
+      "learning_rate": 5e-06,
+      "loss": 0.6493,
+      "step": 2200
+    },
+    {
+      "epoch": 2.2180403964370843,
+      "grad_norm": 0.5331049457160831,
+      "learning_rate": 5e-06,
+      "loss": 0.6537,
+      "step": 2210
+    },
+    {
+      "epoch": 2.228076778321415,
+      "grad_norm": 0.5988937144831796,
+      "learning_rate": 5e-06,
+      "loss": 0.6491,
+      "step": 2220
+    },
+    {
+      "epoch": 2.238113160205746,
+      "grad_norm": 0.5493199529484449,
+      "learning_rate": 5e-06,
+      "loss": 0.6497,
+      "step": 2230
+    },
+    {
+      "epoch": 2.2481495420900766,
+      "grad_norm": 0.5391276137070077,
+      "learning_rate": 5e-06,
+      "loss": 0.6479,
+      "step": 2240
+    },
+    {
+      "epoch": 2.2581859239744073,
+      "grad_norm": 0.5468874755147779,
+      "learning_rate": 5e-06,
+      "loss": 0.6481,
+      "step": 2250
+    },
+    {
+      "epoch": 2.268222305858738,
+      "grad_norm": 0.6698860969319781,
+      "learning_rate": 5e-06,
+      "loss": 0.6496,
+      "step": 2260
+    },
+    {
+      "epoch": 2.278258687743069,
+      "grad_norm": 0.5534091734548499,
+      "learning_rate": 5e-06,
+      "loss": 0.6495,
+      "step": 2270
+    },
+    {
+      "epoch": 2.288295069627399,
+      "grad_norm": 0.5251666746216365,
+      "learning_rate": 5e-06,
+      "loss": 0.6488,
+      "step": 2280
+    },
+    {
+      "epoch": 2.29833145151173,
+      "grad_norm": 0.6041634029692523,
+      "learning_rate": 5e-06,
+      "loss": 0.6463,
+      "step": 2290
+    },
+    {
+      "epoch": 2.3083678333960607,
+      "grad_norm": 0.6700682402301694,
+      "learning_rate": 5e-06,
+      "loss": 0.6512,
+      "step": 2300
+    },
+    {
+      "epoch": 2.3184042152803914,
+      "grad_norm": 0.5507167873654453,
+      "learning_rate": 5e-06,
+      "loss": 0.6531,
+      "step": 2310
+    },
+    {
+      "epoch": 2.328440597164722,
+      "grad_norm": 0.7110576199813733,
+      "learning_rate": 5e-06,
+      "loss": 0.651,
+      "step": 2320
+    },
+    {
+      "epoch": 2.338476979049053,
+      "grad_norm": 0.547011003517586,
+      "learning_rate": 5e-06,
+      "loss": 0.6496,
+      "step": 2330
+    },
+    {
+      "epoch": 2.3485133609333837,
+      "grad_norm": 0.5277892887616297,
+      "learning_rate": 5e-06,
+      "loss": 0.6478,
+      "step": 2340
+    },
+    {
+      "epoch": 2.358549742817714,
+      "grad_norm": 0.5418890015733175,
+      "learning_rate": 5e-06,
+      "loss": 0.6474,
+      "step": 2350
+    },
+    {
+      "epoch": 2.3685861247020448,
+      "grad_norm": 0.5453644592494074,
+      "learning_rate": 5e-06,
+      "loss": 0.6513,
+      "step": 2360
+    },
+    {
+      "epoch": 2.3786225065863755,
+      "grad_norm": 0.5570648512157045,
+      "learning_rate": 5e-06,
+      "loss": 0.6549,
+      "step": 2370
+    },
+    {
+      "epoch": 2.3886588884707063,
+      "grad_norm": 0.6533123491533362,
+      "learning_rate": 5e-06,
+      "loss": 0.6517,
+      "step": 2380
+    },
+    {
+      "epoch": 2.398695270355037,
+      "grad_norm": 0.5514173498092635,
+      "learning_rate": 5e-06,
+      "loss": 0.6479,
+      "step": 2390
+    },
+    {
+      "epoch": 2.408731652239368,
+      "grad_norm": 0.6169109034186734,
+      "learning_rate": 5e-06,
+      "loss": 0.6497,
+      "step": 2400
+    },
+    {
+      "epoch": 2.4187680341236986,
+      "grad_norm": 0.6092701719038681,
+      "learning_rate": 5e-06,
+      "loss": 0.6523,
+      "step": 2410
+    },
+    {
+      "epoch": 2.4288044160080293,
+      "grad_norm": 0.5995748884635175,
+      "learning_rate": 5e-06,
+      "loss": 0.653,
+      "step": 2420
+    },
+    {
+      "epoch": 2.4388407978923596,
+      "grad_norm": 0.5733910522734972,
+      "learning_rate": 5e-06,
+      "loss": 0.6482,
+      "step": 2430
+    },
+    {
+      "epoch": 2.4488771797766904,
+      "grad_norm": 0.5191638101081872,
+      "learning_rate": 5e-06,
+      "loss": 0.656,
+      "step": 2440
+    },
+    {
+      "epoch": 2.458913561661021,
+      "grad_norm": 0.8798002303204917,
+      "learning_rate": 5e-06,
+      "loss": 0.6501,
+      "step": 2450
+    },
+    {
+      "epoch": 2.468949943545352,
+      "grad_norm": 0.5055430681257954,
+      "learning_rate": 5e-06,
+      "loss": 0.6526,
+      "step": 2460
+    },
+    {
+      "epoch": 2.4789863254296827,
+      "grad_norm": 0.5305005320670955,
+      "learning_rate": 5e-06,
+      "loss": 0.6579,
+      "step": 2470
+    },
+    {
+      "epoch": 2.4890227073140134,
+      "grad_norm": 0.6181982492315344,
+      "learning_rate": 5e-06,
+      "loss": 0.6473,
+      "step": 2480
+    },
+    {
+      "epoch": 2.499059089198344,
+      "grad_norm": 0.6215467401287779,
+      "learning_rate": 5e-06,
+      "loss": 0.6511,
+      "step": 2490
+    },
+    {
+      "epoch": 2.5090954710826745,
+      "grad_norm": 0.5729138634014542,
+      "learning_rate": 5e-06,
+      "loss": 0.6577,
+      "step": 2500
+    },
+    {
+      "epoch": 2.5191318529670053,
+      "grad_norm": 0.5393679833546277,
+      "learning_rate": 5e-06,
+      "loss": 0.6536,
+      "step": 2510
+    },
+    {
+      "epoch": 2.529168234851336,
+      "grad_norm": 0.534381658436043,
+      "learning_rate": 5e-06,
+      "loss": 0.6489,
+      "step": 2520
+    },
+    {
+      "epoch": 2.5392046167356668,
+      "grad_norm": 0.539600655245499,
+      "learning_rate": 5e-06,
+      "loss": 0.6495,
+      "step": 2530
+    },
+    {
+      "epoch": 2.5492409986199975,
+      "grad_norm": 0.6226376194292436,
+      "learning_rate": 5e-06,
+      "loss": 0.651,
+      "step": 2540
+    },
+    {
+      "epoch": 2.5592773805043283,
+      "grad_norm": 0.5865717492190782,
+      "learning_rate": 5e-06,
+      "loss": 0.6541,
+      "step": 2550
+    },
+    {
+      "epoch": 2.569313762388659,
+      "grad_norm": 0.6438993538240664,
+      "learning_rate": 5e-06,
+      "loss": 0.6507,
+      "step": 2560
+    },
+    {
+      "epoch": 2.57935014427299,
+      "grad_norm": 0.7518127666950809,
+      "learning_rate": 5e-06,
+      "loss": 0.6499,
+      "step": 2570
+    },
+    {
+      "epoch": 2.58938652615732,
+      "grad_norm": 0.586864178840436,
+      "learning_rate": 5e-06,
+      "loss": 0.6616,
+      "step": 2580
+    },
+    {
+      "epoch": 2.599422908041651,
+      "grad_norm": 0.5941413788184908,
+      "learning_rate": 5e-06,
+      "loss": 0.6486,
+      "step": 2590
+    },
+    {
+      "epoch": 2.6094592899259816,
+      "grad_norm": 0.5451550588012898,
+      "learning_rate": 5e-06,
+      "loss": 0.6544,
+      "step": 2600
+    },
+    {
+      "epoch": 2.6194956718103124,
+      "grad_norm": 0.5147403638954634,
+      "learning_rate": 5e-06,
+      "loss": 0.6539,
+      "step": 2610
+    },
+    {
+      "epoch": 2.629532053694643,
+      "grad_norm": 0.5467938780782444,
+      "learning_rate": 5e-06,
+      "loss": 0.6475,
+      "step": 2620
+    },
+    {
+      "epoch": 2.639568435578974,
+      "grad_norm": 0.5458940947855774,
+      "learning_rate": 5e-06,
+      "loss": 0.6501,
+      "step": 2630
+    },
+    {
+      "epoch": 2.6496048174633042,
+      "grad_norm": 0.600288340260203,
+      "learning_rate": 5e-06,
+      "loss": 0.6533,
+      "step": 2640
+    },
+    {
+      "epoch": 2.659641199347635,
+      "grad_norm": 0.5329802788249394,
+      "learning_rate": 5e-06,
+      "loss": 0.6539,
+      "step": 2650
+    },
+    {
+      "epoch": 2.6696775812319657,
+      "grad_norm": 0.5494428968278945,
+      "learning_rate": 5e-06,
+      "loss": 0.6485,
+      "step": 2660
+    },
+    {
+      "epoch": 2.6797139631162965,
+      "grad_norm": 0.5583817948296362,
+      "learning_rate": 5e-06,
+      "loss": 0.6522,
+      "step": 2670
+    },
+    {
+      "epoch": 2.6897503450006273,
+      "grad_norm": 0.5760005041925667,
+      "learning_rate": 5e-06,
+      "loss": 0.6513,
+      "step": 2680
+    },
+    {
+      "epoch": 2.699786726884958,
+      "grad_norm": 0.6094121074922789,
+      "learning_rate": 5e-06,
+      "loss": 0.6507,
+      "step": 2690
+    },
+    {
+      "epoch": 2.709823108769289,
+      "grad_norm": 0.6168874365624798,
+      "learning_rate": 5e-06,
+      "loss": 0.6532,
+      "step": 2700
+    },
+    {
+      "epoch": 2.7198594906536195,
+      "grad_norm": 0.5204111827775895,
+      "learning_rate": 5e-06,
+      "loss": 0.652,
+      "step": 2710
+    },
+    {
+      "epoch": 2.7298958725379503,
+      "grad_norm": 0.5414555391536807,
+      "learning_rate": 5e-06,
+      "loss": 0.6529,
+      "step": 2720
+    },
+    {
+      "epoch": 2.7399322544222806,
+      "grad_norm": 0.5343532711007503,
+      "learning_rate": 5e-06,
+      "loss": 0.6551,
+      "step": 2730
+    },
+    {
+      "epoch": 2.7499686363066114,
+      "grad_norm": 0.5730034177518993,
+      "learning_rate": 5e-06,
+      "loss": 0.6559,
+      "step": 2740
+    },
+    {
+      "epoch": 2.760005018190942,
+      "grad_norm": 0.5413799739509437,
+      "learning_rate": 5e-06,
+      "loss": 0.6536,
+      "step": 2750
+    },
+    {
+      "epoch": 2.770041400075273,
+      "grad_norm": 0.58185222058302,
+      "learning_rate": 5e-06,
+      "loss": 0.6607,
+      "step": 2760
+    },
+    {
+      "epoch": 2.7800777819596036,
+      "grad_norm": 0.5659358615987367,
+      "learning_rate": 5e-06,
+      "loss": 0.6532,
+      "step": 2770
+    },
+    {
+      "epoch": 2.7901141638439344,
+      "grad_norm": 0.5269963486852614,
+      "learning_rate": 5e-06,
+      "loss": 0.6516,
+      "step": 2780
+    },
+    {
+      "epoch": 2.8001505457282647,
+      "grad_norm": 0.6056482620803397,
+      "learning_rate": 5e-06,
+      "loss": 0.6552,
+      "step": 2790
+    },
+    {
+      "epoch": 2.8101869276125955,
+      "grad_norm": 1.0590493469402826,
+      "learning_rate": 5e-06,
+      "loss": 0.6556,
+      "step": 2800
+    },
+    {
+      "epoch": 2.8202233094969262,
+      "grad_norm": 0.8284099367079102,
+      "learning_rate": 5e-06,
+      "loss": 0.6549,
+      "step": 2810
+    },
+    {
+      "epoch": 2.830259691381257,
+      "grad_norm": 0.6501574839000807,
+      "learning_rate": 5e-06,
+      "loss": 0.6507,
+      "step": 2820
+    },
+    {
+      "epoch": 2.8402960732655878,
+      "grad_norm": 0.5410870388612636,
+      "learning_rate": 5e-06,
+      "loss": 0.6526,
+      "step": 2830
+    },
+    {
+      "epoch": 2.8503324551499185,
+      "grad_norm": 0.6510662758185398,
+      "learning_rate": 5e-06,
+      "loss": 0.6519,
+      "step": 2840
+    },
+    {
+      "epoch": 2.8603688370342493,
+      "grad_norm": 0.5238204299865007,
+      "learning_rate": 5e-06,
+      "loss": 0.6593,
+      "step": 2850
+    },
+    {
+      "epoch": 2.87040521891858,
+      "grad_norm": 0.5397683523247623,
+      "learning_rate": 5e-06,
+      "loss": 0.6526,
+      "step": 2860
+    },
+    {
+      "epoch": 2.880441600802911,
+      "grad_norm": 0.5558371306126749,
+      "learning_rate": 5e-06,
+      "loss": 0.6521,
+      "step": 2870
+    },
+    {
+      "epoch": 2.890477982687241,
+      "grad_norm": 0.5084123704706736,
+      "learning_rate": 5e-06,
+      "loss": 0.655,
+      "step": 2880
+    },
+    {
+      "epoch": 2.900514364571572,
+      "grad_norm": 0.5363806596935408,
+      "learning_rate": 5e-06,
+      "loss": 0.6523,
+      "step": 2890
+    },
+    {
+      "epoch": 2.9105507464559026,
+      "grad_norm": 0.526536425381128,
+      "learning_rate": 5e-06,
+      "loss": 0.6549,
+      "step": 2900
+    },
+    {
+      "epoch": 2.9205871283402334,
+      "grad_norm": 0.5720711163344511,
+      "learning_rate": 5e-06,
+      "loss": 0.6516,
+      "step": 2910
+    },
+    {
+      "epoch": 2.930623510224564,
+      "grad_norm": 0.5168344329750222,
+      "learning_rate": 5e-06,
+      "loss": 0.6505,
+      "step": 2920
+    },
+    {
+      "epoch": 2.940659892108895,
+      "grad_norm": 0.5068041805158231,
+      "learning_rate": 5e-06,
+      "loss": 0.6523,
+      "step": 2930
+    },
+    {
+      "epoch": 2.950696273993225,
+      "grad_norm": 0.5854150052744146,
+      "learning_rate": 5e-06,
+      "loss": 0.6562,
+      "step": 2940
+    },
+    {
+      "epoch": 2.960732655877556,
+      "grad_norm": 0.6196831772444017,
+      "learning_rate": 5e-06,
+      "loss": 0.6542,
+      "step": 2950
+    },
+    {
+      "epoch": 2.9707690377618867,
+      "grad_norm": 0.5314563776407271,
+      "learning_rate": 5e-06,
+      "loss": 0.6561,
+      "step": 2960
+    },
+    {
+      "epoch": 2.9808054196462175,
+      "grad_norm": 0.5039099479336567,
+      "learning_rate": 5e-06,
+      "loss": 0.6563,
+      "step": 2970
+    },
+    {
+      "epoch": 2.9908418015305482,
+      "grad_norm": 0.5313007683420622,
+      "learning_rate": 5e-06,
+      "loss": 0.6521,
+      "step": 2980
+    },
+    {
+      "epoch": 2.999874545226446,
+      "eval_loss": 0.7146658897399902,
+      "eval_runtime": 674.3767,
+      "eval_samples_per_second": 39.813,
+      "eval_steps_per_second": 0.623,
+      "step": 2989
+    },
+    {
+      "epoch": 3.000878183414879,
+      "grad_norm": 1.159167907299322,
+      "learning_rate": 5e-06,
+      "loss": 0.6898,
+      "step": 2990
+    },
+    {
+      "epoch": 3.0109145652992098,
+      "grad_norm": 0.7818114221168871,
+      "learning_rate": 5e-06,
+      "loss": 0.6112,
+      "step": 3000
+    },
+    {
+      "epoch": 3.0209509471835405,
+      "grad_norm": 0.7106384774672879,
+      "learning_rate": 5e-06,
+      "loss": 0.6059,
+      "step": 3010
+    },
+    {
+      "epoch": 3.030987329067871,
+      "grad_norm": 0.640003891116449,
+      "learning_rate": 5e-06,
+      "loss": 0.6085,
+      "step": 3020
+    },
+    {
+      "epoch": 3.0410237109522016,
+      "grad_norm": 0.6066326657235696,
+      "learning_rate": 5e-06,
+      "loss": 0.6077,
+      "step": 3030
+    },
+    {
+      "epoch": 3.0510600928365323,
+      "grad_norm": 0.6071064742090702,
+      "learning_rate": 5e-06,
+      "loss": 0.6116,
+      "step": 3040
+    },
+    {
+      "epoch": 3.061096474720863,
+      "grad_norm": 0.693281501487692,
+      "learning_rate": 5e-06,
+      "loss": 0.6052,
+      "step": 3050
+    },
+    {
+      "epoch": 3.071132856605194,
+      "grad_norm": 0.7693025911684378,
+      "learning_rate": 5e-06,
+      "loss": 0.6095,
+      "step": 3060
+    },
+    {
+      "epoch": 3.0811692384895246,
+      "grad_norm": 0.6184427233437038,
+      "learning_rate": 5e-06,
+      "loss": 0.6069,
+      "step": 3070
+    },
+    {
+      "epoch": 3.0912056203738554,
+      "grad_norm": 0.634870226376911,
+      "learning_rate": 5e-06,
+      "loss": 0.6099,
+      "step": 3080
+    },
+    {
+      "epoch": 3.101242002258186,
+      "grad_norm": 0.5947920487741215,
+      "learning_rate": 5e-06,
+      "loss": 0.6092,
+      "step": 3090
+    },
+    {
+      "epoch": 3.1112783841425165,
+      "grad_norm": 0.5475506951891964,
+      "learning_rate": 5e-06,
+      "loss": 0.6066,
+      "step": 3100
+    },
+    {
+      "epoch": 3.121314766026847,
+      "grad_norm": 0.5786846254856872,
+      "learning_rate": 5e-06,
+      "loss": 0.6077,
+      "step": 3110
+    },
+    {
+      "epoch": 3.131351147911178,
+      "grad_norm": 0.5837921428316006,
+      "learning_rate": 5e-06,
+      "loss": 0.6125,
+      "step": 3120
+    },
+    {
+      "epoch": 3.1413875297955087,
+      "grad_norm": 0.6388660075166559,
+      "learning_rate": 5e-06,
+      "loss": 0.6073,
+      "step": 3130
+    },
+    {
+      "epoch": 3.1514239116798395,
+      "grad_norm": 0.6247319700614546,
+      "learning_rate": 5e-06,
+      "loss": 0.6129,
+      "step": 3140
+    },
+    {
+      "epoch": 3.1614602935641702,
+      "grad_norm": 0.7220969862146115,
+      "learning_rate": 5e-06,
+      "loss": 0.6096,
+      "step": 3150
+    },
+    {
+      "epoch": 3.171496675448501,
+      "grad_norm": 0.5966143252277392,
+      "learning_rate": 5e-06,
+      "loss": 0.6108,
+      "step": 3160
+    },
+    {
+      "epoch": 3.1815330573328313,
+      "grad_norm": 0.5806668148525886,
+      "learning_rate": 5e-06,
+      "loss": 0.611,
+      "step": 3170
+    },
+    {
+      "epoch": 3.191569439217162,
+      "grad_norm": 0.5847564401984537,
+      "learning_rate": 5e-06,
+      "loss": 0.6099,
+      "step": 3180
+    },
+    {
+      "epoch": 3.201605821101493,
+      "grad_norm": 0.5685073324759383,
+      "learning_rate": 5e-06,
+      "loss": 0.6162,
+      "step": 3190
+    },
+    {
+      "epoch": 3.2116422029858236,
+      "grad_norm": 0.5806892343391038,
+      "learning_rate": 5e-06,
+      "loss": 0.6099,
+      "step": 3200
+    },
+    {
+      "epoch": 3.2216785848701543,
+      "grad_norm": 0.5629335787755336,
+      "learning_rate": 5e-06,
+      "loss": 0.6122,
+      "step": 3210
+    },
+    {
+      "epoch": 3.231714966754485,
+      "grad_norm": 0.6104998235017857,
+      "learning_rate": 5e-06,
+      "loss": 0.6128,
+      "step": 3220
+    },
+    {
+      "epoch": 3.241751348638816,
+      "grad_norm": 0.670576007712542,
+      "learning_rate": 5e-06,
+      "loss": 0.6122,
+      "step": 3230
+    },
+    {
+      "epoch": 3.251787730523146,
+      "grad_norm": 0.6597487550561909,
+      "learning_rate": 5e-06,
+      "loss": 0.6135,
+      "step": 3240
+    },
+    {
+      "epoch": 3.261824112407477,
+      "grad_norm": 0.5645378989833628,
+      "learning_rate": 5e-06,
+      "loss": 0.6149,
+      "step": 3250
+    },
+    {
+      "epoch": 3.2718604942918077,
+      "grad_norm": 0.5939861646065504,
+      "learning_rate": 5e-06,
+      "loss": 0.6097,
+      "step": 3260
+    },
+    {
+      "epoch": 3.2818968761761385,
+      "grad_norm": 0.6160406690736504,
+      "learning_rate": 5e-06,
+      "loss": 0.6086,
+      "step": 3270
+    },
+    {
+      "epoch": 3.291933258060469,
+      "grad_norm": 0.5487825374465094,
+      "learning_rate": 5e-06,
+      "loss": 0.6144,
+      "step": 3280
+    },
+    {
+      "epoch": 3.3019696399448,
+      "grad_norm": 0.6520181865316601,
+      "learning_rate": 5e-06,
+      "loss": 0.6183,
+      "step": 3290
+    },
+    {
+      "epoch": 3.3120060218291307,
+      "grad_norm": 0.5977506986605584,
+      "learning_rate": 5e-06,
+      "loss": 0.6147,
+      "step": 3300
+    },
+    {
+      "epoch": 3.3220424037134615,
+      "grad_norm": 0.6484133892242163,
+      "learning_rate": 5e-06,
+      "loss": 0.6151,
+      "step": 3310
+    },
+    {
+      "epoch": 3.332078785597792,
+      "grad_norm": 0.5970543245993525,
+      "learning_rate": 5e-06,
+      "loss": 0.6108,
+      "step": 3320
+    },
+    {
+      "epoch": 3.3421151674821226,
+      "grad_norm": 0.6116862845869632,
+      "learning_rate": 5e-06,
+      "loss": 0.6166,
+      "step": 3330
+    },
+    {
+      "epoch": 3.3521515493664533,
+      "grad_norm": 0.5580458755366267,
+      "learning_rate": 5e-06,
+      "loss": 0.6177,
+      "step": 3340
+    },
+    {
+      "epoch": 3.362187931250784,
+      "grad_norm": 0.7040073547476862,
+      "learning_rate": 5e-06,
+      "loss": 0.6181,
+      "step": 3350
+    },
+    {
+      "epoch": 3.372224313135115,
+      "grad_norm": 0.652807816857214,
+      "learning_rate": 5e-06,
+      "loss": 0.6186,
+      "step": 3360
+    },
+    {
+      "epoch": 3.3822606950194456,
+      "grad_norm": 0.6106384378999347,
+      "learning_rate": 5e-06,
+      "loss": 0.6189,
+      "step": 3370
+    },
+    {
+      "epoch": 3.3922970769037764,
+      "grad_norm": 0.6920160744092827,
+      "learning_rate": 5e-06,
+      "loss": 0.617,
+      "step": 3380
+    },
+    {
+      "epoch": 3.4023334587881067,
+      "grad_norm": 0.5814853765567533,
+      "learning_rate": 5e-06,
+      "loss": 0.6118,
+      "step": 3390
+    },
+    {
+      "epoch": 3.4123698406724374,
+      "grad_norm": 0.5389887117603109,
+      "learning_rate": 5e-06,
+      "loss": 0.6126,
+      "step": 3400
+    },
+    {
+      "epoch": 3.422406222556768,
+      "grad_norm": 0.582316325799389,
+      "learning_rate": 5e-06,
+      "loss": 0.6143,
+      "step": 3410
+    },
+    {
+      "epoch": 3.432442604441099,
+      "grad_norm": 0.5612761289810537,
+      "learning_rate": 5e-06,
+      "loss": 0.616,
+      "step": 3420
+    },
+    {
+      "epoch": 3.4424789863254297,
+      "grad_norm": 0.5315307027152637,
+      "learning_rate": 5e-06,
+      "loss": 0.6126,
+      "step": 3430
+    },
+    {
+      "epoch": 3.4525153682097605,
+      "grad_norm": 0.5632095541748001,
+      "learning_rate": 5e-06,
+      "loss": 0.6139,
+      "step": 3440
+    },
+    {
+      "epoch": 3.462551750094091,
+      "grad_norm": 0.5777253428377956,
+      "learning_rate": 5e-06,
+      "loss": 0.6153,
+      "step": 3450
+    },
+    {
+      "epoch": 3.472588131978422,
+      "grad_norm": 0.5802066633079221,
+      "learning_rate": 5e-06,
+      "loss": 0.6183,
+      "step": 3460
+    },
+    {
+      "epoch": 3.4826245138627523,
+      "grad_norm": 0.6858510179050318,
+      "learning_rate": 5e-06,
+      "loss": 0.6105,
+      "step": 3470
+    },
+    {
+      "epoch": 3.492660895747083,
+      "grad_norm": 0.6150923461042579,
+      "learning_rate": 5e-06,
+      "loss": 0.6123,
+      "step": 3480
+    },
+    {
+      "epoch": 3.502697277631414,
+      "grad_norm": 0.6850358509214438,
+      "learning_rate": 5e-06,
+      "loss": 0.6176,
+      "step": 3490
+    },
+    {
+      "epoch": 3.5127336595157446,
+      "grad_norm": 0.6221194287714066,
+      "learning_rate": 5e-06,
+      "loss": 0.6135,
+      "step": 3500
+    },
+    {
+      "epoch": 3.5227700414000753,
+      "grad_norm": 0.6337555357747637,
+      "learning_rate": 5e-06,
+      "loss": 0.6176,
+      "step": 3510
+    },
+    {
+      "epoch": 3.532806423284406,
+      "grad_norm": 0.5696342404252304,
+      "learning_rate": 5e-06,
+      "loss": 0.6194,
+      "step": 3520
+    },
+    {
+      "epoch": 3.5428428051687364,
+      "grad_norm": 0.5192096724412292,
+      "learning_rate": 5e-06,
+      "loss": 0.6169,
+      "step": 3530
+    },
+    {
+      "epoch": 3.552879187053067,
+      "grad_norm": 0.6461636488212382,
+      "learning_rate": 5e-06,
+      "loss": 0.6194,
+      "step": 3540
+    },
+    {
+      "epoch": 3.562915568937398,
+      "grad_norm": 0.5204792269879596,
+      "learning_rate": 5e-06,
+      "loss": 0.6134,
+      "step": 3550
+    },
+    {
+      "epoch": 3.5729519508217287,
+      "grad_norm": 0.5799708864875738,
+      "learning_rate": 5e-06,
+      "loss": 0.6188,
+      "step": 3560
+    },
+    {
+      "epoch": 3.5829883327060594,
+      "grad_norm": 0.5463250823773549,
+      "learning_rate": 5e-06,
+      "loss": 0.6176,
+      "step": 3570
+    },
+    {
+      "epoch": 3.59302471459039,
+      "grad_norm": 0.6314712032266755,
+      "learning_rate": 5e-06,
+      "loss": 0.6207,
+      "step": 3580
+    },
+    {
+      "epoch": 3.603061096474721,
+      "grad_norm": 0.6479660409480549,
+      "learning_rate": 5e-06,
+      "loss": 0.6164,
+      "step": 3590
+    },
+    {
+      "epoch": 3.6130974783590517,
+      "grad_norm": 0.6300826657668005,
+      "learning_rate": 5e-06,
+      "loss": 0.6161,
+      "step": 3600
+    },
+    {
+      "epoch": 3.6231338602433825,
+      "grad_norm": 0.6595382686169196,
+      "learning_rate": 5e-06,
+      "loss": 0.6206,
+      "step": 3610
+    },
+    {
+      "epoch": 3.6331702421277132,
+      "grad_norm": 0.5652565303110992,
+      "learning_rate": 5e-06,
+      "loss": 0.6196,
+      "step": 3620
+    },
+    {
+      "epoch": 3.6432066240120435,
+      "grad_norm": 0.579203582642288,
+      "learning_rate": 5e-06,
+      "loss": 0.6204,
+      "step": 3630
+    },
+    {
+      "epoch": 3.6532430058963743,
+      "grad_norm": 0.5437849615714947,
+      "learning_rate": 5e-06,
+      "loss": 0.6183,
+      "step": 3640
+    },
+    {
+      "epoch": 3.663279387780705,
+      "grad_norm": 0.5787593937881179,
+      "learning_rate": 5e-06,
+      "loss": 0.6167,
+      "step": 3650
+    },
+    {
+      "epoch": 3.673315769665036,
+      "grad_norm": 0.5811897003512283,
+      "learning_rate": 5e-06,
+      "loss": 0.6152,
+      "step": 3660
+    },
+    {
+      "epoch": 3.6833521515493666,
+      "grad_norm": 0.5178334072146914,
+      "learning_rate": 5e-06,
+      "loss": 0.612,
+      "step": 3670
+    },
+    {
+      "epoch": 3.693388533433697,
+      "grad_norm": 0.6269969593175169,
+      "learning_rate": 5e-06,
+      "loss": 0.616,
+      "step": 3680
+    },
+    {
+      "epoch": 3.7034249153180276,
+      "grad_norm": 0.6398231596326978,
+      "learning_rate": 5e-06,
+      "loss": 0.62,
+      "step": 3690
+    },
+    {
+      "epoch": 3.7134612972023584,
+      "grad_norm": 0.5787553408157999,
+      "learning_rate": 5e-06,
+      "loss": 0.6199,
+      "step": 3700
+    },
+    {
+      "epoch": 3.723497679086689,
+      "grad_norm": 0.5592510334692263,
+      "learning_rate": 5e-06,
+      "loss": 0.6172,
+      "step": 3710
+    },
+    {
+      "epoch": 3.73353406097102,
+      "grad_norm": 0.5911520225126639,
+      "learning_rate": 5e-06,
+      "loss": 0.6206,
+      "step": 3720
+    },
+    {
+      "epoch": 3.7435704428553507,
+      "grad_norm": 0.5509577144961972,
+      "learning_rate": 5e-06,
+      "loss": 0.6162,
+      "step": 3730
+    },
+    {
+      "epoch": 3.7536068247396814,
+      "grad_norm": 0.6230171740270747,
+      "learning_rate": 5e-06,
+      "loss": 0.6211,
+      "step": 3740
+    },
+    {
+      "epoch": 3.763643206624012,
+      "grad_norm": 0.606064062911815,
+      "learning_rate": 5e-06,
+      "loss": 0.6169,
+      "step": 3750
+    },
+    {
+      "epoch": 3.773679588508343,
+      "grad_norm": 0.5251378729385787,
+      "learning_rate": 5e-06,
+      "loss": 0.6142,
+      "step": 3760
+    },
+    {
+      "epoch": 3.7837159703926737,
+      "grad_norm": 0.5313902122829314,
+      "learning_rate": 5e-06,
+      "loss": 0.6171,
+      "step": 3770
+    },
+    {
+      "epoch": 3.793752352277004,
+      "grad_norm": 0.5738855269066635,
+      "learning_rate": 5e-06,
+      "loss": 0.614,
+      "step": 3780
+    },
+    {
+      "epoch": 3.803788734161335,
+      "grad_norm": 0.6094017649137001,
+      "learning_rate": 5e-06,
+      "loss": 0.6184,
+      "step": 3790
+    },
+    {
+      "epoch": 3.8138251160456655,
+      "grad_norm": 0.577775104243207,
+      "learning_rate": 5e-06,
+      "loss": 0.6147,
+      "step": 3800
+    },
+    {
+      "epoch": 3.8238614979299963,
+      "grad_norm": 0.5896473350091869,
+      "learning_rate": 5e-06,
+      "loss": 0.6191,
+      "step": 3810
+    },
+    {
+      "epoch": 3.833897879814327,
+      "grad_norm": 0.6197381864983481,
+      "learning_rate": 5e-06,
+      "loss": 0.6204,
+      "step": 3820
+    },
+    {
+      "epoch": 3.8439342616986574,
+      "grad_norm": 0.6385235404417998,
+      "learning_rate": 5e-06,
+      "loss": 0.6184,
+      "step": 3830
+    },
+    {
+      "epoch": 3.853970643582988,
+      "grad_norm": 0.5605836986977404,
+      "learning_rate": 5e-06,
+      "loss": 0.6184,
+      "step": 3840
+    },
+    {
+      "epoch": 3.864007025467319,
+      "grad_norm": 0.5314662217673221,
+      "learning_rate": 5e-06,
+      "loss": 0.6175,
+      "step": 3850
+    },
+    {
+      "epoch": 3.8740434073516496,
+      "grad_norm": 0.5617746957894754,
+      "learning_rate": 5e-06,
+      "loss": 0.6168,
+      "step": 3860
+    },
+    {
+      "epoch": 3.8840797892359804,
+      "grad_norm": 0.6130682994037887,
+      "learning_rate": 5e-06,
+      "loss": 0.6213,
+      "step": 3870
+    },
+    {
+      "epoch": 3.894116171120311,
+      "grad_norm": 0.6330660373549564,
+      "learning_rate": 5e-06,
+      "loss": 0.6237,
+      "step": 3880
+    },
+    {
+      "epoch": 3.904152553004642,
+      "grad_norm": 0.5757814490358608,
+      "learning_rate": 5e-06,
+      "loss": 0.6199,
+      "step": 3890
+    },
+    {
+      "epoch": 3.9141889348889727,
+      "grad_norm": 0.5686860241059948,
+      "learning_rate": 5e-06,
+      "loss": 0.6166,
+      "step": 3900
+    },
+    {
+      "epoch": 3.9242253167733034,
+      "grad_norm": 0.577591496190582,
+      "learning_rate": 5e-06,
+      "loss": 0.6194,
+      "step": 3910
+    },
+    {
+      "epoch": 3.934261698657634,
+      "grad_norm": 0.5589041470451204,
+      "learning_rate": 5e-06,
+      "loss": 0.619,
+      "step": 3920
+    },
+    {
+      "epoch": 3.9442980805419645,
+      "grad_norm": 0.6004802459840047,
+      "learning_rate": 5e-06,
+      "loss": 0.6148,
+      "step": 3930
+    },
+    {
+      "epoch": 3.9543344624262953,
+      "grad_norm": 0.6153349703188992,
+      "learning_rate": 5e-06,
+      "loss": 0.6166,
+      "step": 3940
+    },
+    {
+      "epoch": 3.964370844310626,
+      "grad_norm": 0.5218178205884076,
+      "learning_rate": 5e-06,
+      "loss": 0.619,
+      "step": 3950
+    },
+    {
+      "epoch": 3.974407226194957,
+      "grad_norm": 0.547168472398349,
+      "learning_rate": 5e-06,
+      "loss": 0.6194,
+      "step": 3960
+    },
+    {
+      "epoch": 3.9844436080792875,
+      "grad_norm": 0.5359836200059497,
+      "learning_rate": 5e-06,
+      "loss": 0.6225,
+      "step": 3970
+    },
+    {
+      "epoch": 3.994479989963618,
+      "grad_norm": 0.7180200697231374,
+      "learning_rate": 5e-06,
+      "loss": 0.6204,
+      "step": 3980
+    },
+    {
+      "epoch": 3.99849454271735,
+      "eval_loss": 0.7222821116447449,
+      "eval_runtime": 679.219,
+      "eval_samples_per_second": 39.529,
+      "eval_steps_per_second": 0.618,
+      "step": 3984
+    },
+    {
+      "epoch": 3.99849454271735,
+      "step": 3984,
+      "total_flos": 6673139006177280.0,
+      "train_loss": 0.6817261522194468,
+      "train_runtime": 130344.1738,
+      "train_samples_per_second": 15.654,
+      "train_steps_per_second": 0.031
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3984,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6673139006177280.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed