CocoRoF commited on
Commit
ea43299
·
verified ·
1 Parent(s): d7d8249

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "x2bee/KoModernBERT-base-mlm-v02-ckp01",
3
  "architectures": [
4
  "ModernBertForMaskedLM"
5
  ],
 
1
  {
2
+ "_name_or_path": "x2bee/KoModernBERT-base-mlm-v02-ckp02",
3
  "architectures": [
4
  "ModernBertForMaskedLM"
5
  ],
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e44ca357aee33f0c4d77f6283b1039050b3b7ab63acb733b0943c04c3dcd383d
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b42fde2d931440aeb105a7aa23055e1283fb71687fa5d3f2553594d5b2bda4d
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:baf2ed4278033136ed85db98ede9a42075dac27c8de0866c7d2f2fdc4114aff8
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b75254b559a304e81391980e81b803443819ae3cc5363a1be04e6ff7aeac9a
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69bcb5a0833345d8383f742fc9fcb902b5c0cae43ce63aee4b910dcd39dcda65
3
- size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78d3f197f6c6558fa8056324f1563ab9e957255f5a1a959362aa4eed7a9545db
3
+ size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fae392ec6232cbf9da21d6ed12bc8247d0d24e7f3a3606acd23be00f3e8bbfc5
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c1a9c65c2869356282cad6b4a0f7dff7f4dd68ab3d9d216c72b7d6cb524f860
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbf3e7ca9991a58b0b16574a3c653483c551c270aa05aba06c162ea593f7b0f2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:896febe768e17bae5022a95960c041f6425783774ec8859d99d3b149063b1bf9
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c695bebf6bcb75cbe26378bfe0ab7e2a33c49f713b9d6e4d10632b24322977e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eac482d57e966585467c8ef44dae2869bf7e5d92886f69c11ed7bccc34c07efe
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5ebb13c71265c5464c9aa9bb9b66f07764d73befe6cd63a2aaf8e781bf0a374
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1f27d227a20dc320ac283e0938fb2f6e5b475829a583f8c44d1a16a8c828307
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12cc6e245e189be568c8dfd43a4dd8f04bb3dbd9f17f41458107935d2c2a6a9d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d05a7106aaeaec4b81704e3f4a998b5123cf9342a6733bd9fd2d578e99108c3b
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36086646e9a8f76fea69f8a227112e83bb63524964ccdfb82f4cdad88b90e5e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b94120d8d88502ec8d8b623ec7550315caca003b44fcffbb5767ab0de91baefe
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b44153bacf860d0ca6ce4c6b9380a199feab8a72ca613e6745bfb671b02c4e4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:332e4d901be380f740b5d8578f7b80ef1865c7fba83bc288c8a35852205cc668
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:403f43b20707fbdd1c781cdcb70c570b1debcb96cf3e00b550211ddfce6ea77f
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e7caefe5b3c11741d0fb75575bef604da14e2f80b3dca75692f7742d5d3ab3f
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,1426 +1,724 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 3000,
6
- "global_step": 9931,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0050347397039573055,
13
- "grad_norm": 28.6875,
14
- "learning_rate": 7.866954072722124e-10,
15
- "loss": 25.5771,
16
  "step": 50
17
  },
18
  {
19
- "epoch": 0.010069479407914611,
20
- "grad_norm": 25.671875,
21
- "learning_rate": 1.5733908145444248e-09,
22
- "loss": 25.4662,
23
  "step": 100
24
  },
25
  {
26
- "epoch": 0.015104219111871917,
27
- "grad_norm": 22.171875,
28
- "learning_rate": 2.360086221816637e-09,
29
- "loss": 25.2219,
30
  "step": 150
31
  },
32
  {
33
- "epoch": 0.020138958815829222,
34
- "grad_norm": 24.40625,
35
- "learning_rate": 3.1467816290888496e-09,
36
- "loss": 24.9302,
37
  "step": 200
38
  },
39
  {
40
- "epoch": 0.025173698519786528,
41
- "grad_norm": 24.296875,
42
- "learning_rate": 3.933477036361062e-09,
43
- "loss": 24.7105,
44
  "step": 250
45
  },
46
  {
47
- "epoch": 0.030208438223743833,
48
- "grad_norm": 23.890625,
49
- "learning_rate": 4.720172443633274e-09,
50
- "loss": 24.403,
51
  "step": 300
52
  },
53
  {
54
- "epoch": 0.03524317792770114,
55
- "grad_norm": 23.125,
56
- "learning_rate": 5.506867850905486e-09,
57
- "loss": 24.2281,
58
  "step": 350
59
  },
60
  {
61
- "epoch": 0.040277917631658444,
62
- "grad_norm": 23.828125,
63
- "learning_rate": 6.293563258177699e-09,
64
- "loss": 24.0123,
65
  "step": 400
66
  },
67
  {
68
- "epoch": 0.04531265733561575,
69
- "grad_norm": 26.859375,
70
- "learning_rate": 7.080258665449911e-09,
71
- "loss": 23.7867,
72
  "step": 450
73
  },
74
  {
75
- "epoch": 0.050347397039573055,
76
- "grad_norm": 23.78125,
77
- "learning_rate": 7.866954072722123e-09,
78
- "loss": 23.7771,
79
  "step": 500
80
  },
81
  {
82
- "epoch": 0.05538213674353036,
83
- "grad_norm": 23.625,
84
- "learning_rate": 8.653649479994334e-09,
85
- "loss": 23.6351,
86
  "step": 550
87
  },
88
  {
89
- "epoch": 0.060416876447487666,
90
- "grad_norm": 21.953125,
91
- "learning_rate": 9.440344887266548e-09,
92
- "loss": 23.5309,
93
  "step": 600
94
  },
95
  {
96
- "epoch": 0.06545161615144497,
97
- "grad_norm": 21.921875,
98
- "learning_rate": 1.0227040294538761e-08,
99
- "loss": 23.405,
100
  "step": 650
101
  },
102
  {
103
- "epoch": 0.07048635585540228,
104
- "grad_norm": 21.265625,
105
- "learning_rate": 1.1013735701810972e-08,
106
- "loss": 23.2201,
107
  "step": 700
108
  },
109
  {
110
- "epoch": 0.07552109555935958,
111
- "grad_norm": 23.890625,
112
- "learning_rate": 1.1800431109083184e-08,
113
- "loss": 23.2961,
114
  "step": 750
115
  },
116
  {
117
- "epoch": 0.08055583526331689,
118
- "grad_norm": 20.859375,
119
- "learning_rate": 1.2587126516355398e-08,
120
- "loss": 23.3068,
121
  "step": 800
122
  },
123
  {
124
- "epoch": 0.0855905749672742,
125
- "grad_norm": 21.234375,
126
- "learning_rate": 1.3373821923627609e-08,
127
- "loss": 23.0151,
128
  "step": 850
129
  },
130
  {
131
- "epoch": 0.0906253146712315,
132
- "grad_norm": 21.859375,
133
- "learning_rate": 1.4160517330899822e-08,
134
- "loss": 23.0405,
135
  "step": 900
136
  },
137
  {
138
- "epoch": 0.0956600543751888,
139
- "grad_norm": 25.3125,
140
- "learning_rate": 1.4947212738172034e-08,
141
- "loss": 22.9393,
142
  "step": 950
143
  },
144
  {
145
- "epoch": 0.10069479407914611,
146
- "grad_norm": 23.65625,
147
- "learning_rate": 1.5733908145444247e-08,
148
- "loss": 22.8635,
149
  "step": 1000
150
  },
151
  {
152
- "epoch": 0.10572953378310342,
153
- "grad_norm": 21.609375,
154
- "learning_rate": 1.6520603552716456e-08,
155
- "loss": 22.8879,
156
  "step": 1050
157
  },
158
  {
159
- "epoch": 0.11076427348706072,
160
- "grad_norm": 22.671875,
161
- "learning_rate": 1.730729895998867e-08,
162
- "loss": 22.8493,
163
  "step": 1100
164
  },
165
  {
166
- "epoch": 0.11579901319101803,
167
- "grad_norm": 20.625,
168
- "learning_rate": 1.8093994367260884e-08,
169
- "loss": 22.8345,
170
  "step": 1150
171
  },
172
  {
173
- "epoch": 0.12083375289497533,
174
- "grad_norm": 24.859375,
175
- "learning_rate": 1.8880689774533097e-08,
176
- "loss": 22.698,
177
  "step": 1200
178
  },
179
  {
180
- "epoch": 0.12586849259893262,
181
- "grad_norm": 21.734375,
182
- "learning_rate": 1.966738518180531e-08,
183
- "loss": 22.5749,
184
  "step": 1250
185
  },
186
  {
187
- "epoch": 0.13090323230288994,
188
- "grad_norm": 20.328125,
189
- "learning_rate": 2.0454080589077522e-08,
190
- "loss": 22.501,
191
  "step": 1300
192
  },
193
  {
194
- "epoch": 0.13593797200684724,
195
- "grad_norm": 18.21875,
196
- "learning_rate": 2.124077599634973e-08,
197
- "loss": 22.5212,
198
  "step": 1350
199
  },
200
  {
201
- "epoch": 0.14097271171080455,
202
- "grad_norm": 21.71875,
203
- "learning_rate": 2.2027471403621943e-08,
204
- "loss": 22.382,
205
  "step": 1400
206
  },
207
  {
208
- "epoch": 0.14600745141476185,
209
  "grad_norm": 21.15625,
210
- "learning_rate": 2.2814166810894156e-08,
211
- "loss": 22.3512,
212
  "step": 1450
213
  },
214
  {
215
- "epoch": 0.15104219111871917,
216
- "grad_norm": 21.234375,
217
- "learning_rate": 2.360086221816637e-08,
218
- "loss": 22.4326,
219
  "step": 1500
220
  },
221
  {
222
- "epoch": 0.15607693082267646,
223
- "grad_norm": 23.34375,
224
- "learning_rate": 2.438755762543858e-08,
225
- "loss": 22.3669,
226
  "step": 1550
227
  },
228
  {
229
- "epoch": 0.16111167052663378,
230
- "grad_norm": 27.390625,
231
- "learning_rate": 2.5174253032710797e-08,
232
- "loss": 22.1896,
233
  "step": 1600
234
  },
235
  {
236
- "epoch": 0.16614641023059107,
237
- "grad_norm": 23.703125,
238
- "learning_rate": 2.5960948439983006e-08,
239
- "loss": 22.1401,
240
  "step": 1650
241
  },
242
  {
243
- "epoch": 0.1711811499345484,
244
- "grad_norm": 21.3125,
245
- "learning_rate": 2.6747643847255218e-08,
246
- "loss": 22.0221,
247
  "step": 1700
248
  },
249
  {
250
- "epoch": 0.17621588963850568,
251
- "grad_norm": 23.59375,
252
- "learning_rate": 2.753433925452743e-08,
253
- "loss": 22.1581,
254
  "step": 1750
255
  },
256
  {
257
- "epoch": 0.181250629342463,
258
- "grad_norm": 22.96875,
259
- "learning_rate": 2.8321034661799643e-08,
260
- "loss": 22.0211,
261
  "step": 1800
262
  },
263
  {
264
- "epoch": 0.1862853690464203,
265
- "grad_norm": 21.34375,
266
- "learning_rate": 2.9107730069071856e-08,
267
- "loss": 22.0172,
268
  "step": 1850
269
  },
270
  {
271
- "epoch": 0.1913201087503776,
272
- "grad_norm": 21.5625,
273
- "learning_rate": 2.989442547634407e-08,
274
- "loss": 21.9346,
275
  "step": 1900
276
  },
277
  {
278
- "epoch": 0.1963548484543349,
279
- "grad_norm": 23.4375,
280
- "learning_rate": 3.068112088361628e-08,
281
- "loss": 22.0178,
282
  "step": 1950
283
  },
284
  {
285
- "epoch": 0.20138958815829222,
286
- "grad_norm": 19.234375,
287
- "learning_rate": 3.1467816290888493e-08,
288
- "loss": 21.8406,
289
  "step": 2000
290
  },
291
  {
292
- "epoch": 0.2064243278622495,
293
- "grad_norm": 23.53125,
294
- "learning_rate": 3.2254511698160706e-08,
295
- "loss": 21.8947,
296
  "step": 2050
297
  },
298
  {
299
- "epoch": 0.21145906756620683,
300
- "grad_norm": 25.53125,
301
- "learning_rate": 3.304120710543291e-08,
302
- "loss": 21.847,
303
  "step": 2100
304
  },
305
  {
306
- "epoch": 0.21649380727016412,
307
- "grad_norm": 22.34375,
308
- "learning_rate": 3.3827902512705124e-08,
309
- "loss": 21.7869,
310
  "step": 2150
311
  },
312
  {
313
- "epoch": 0.22152854697412144,
314
- "grad_norm": 20.8125,
315
- "learning_rate": 3.461459791997734e-08,
316
- "loss": 21.7937,
317
  "step": 2200
318
  },
319
  {
320
- "epoch": 0.22656328667807873,
321
- "grad_norm": 24.140625,
322
- "learning_rate": 3.5401293327249556e-08,
323
- "loss": 21.6699,
324
  "step": 2250
325
  },
326
  {
327
- "epoch": 0.23159802638203605,
328
- "grad_norm": 21.09375,
329
- "learning_rate": 3.618798873452177e-08,
330
- "loss": 21.7536,
331
  "step": 2300
332
  },
333
  {
334
- "epoch": 0.23663276608599335,
335
- "grad_norm": 30.21875,
336
- "learning_rate": 3.697468414179398e-08,
337
- "loss": 21.5712,
338
  "step": 2350
339
  },
340
  {
341
- "epoch": 0.24166750578995067,
342
- "grad_norm": 19.5625,
343
- "learning_rate": 3.7761379549066193e-08,
344
- "loss": 21.6019,
345
  "step": 2400
346
  },
347
  {
348
- "epoch": 0.24670224549390796,
349
- "grad_norm": 20.234375,
350
- "learning_rate": 3.8548074956338406e-08,
351
- "loss": 21.4812,
352
  "step": 2450
353
  },
354
  {
355
- "epoch": 0.25173698519786525,
356
- "grad_norm": 24.671875,
357
- "learning_rate": 3.933477036361062e-08,
358
- "loss": 21.3988,
359
  "step": 2500
360
  },
361
  {
362
- "epoch": 0.2567717249018226,
363
- "grad_norm": 21.53125,
364
- "learning_rate": 4.012146577088283e-08,
365
- "loss": 21.335,
366
  "step": 2550
367
  },
368
  {
369
- "epoch": 0.2618064646057799,
370
- "grad_norm": 21.046875,
371
- "learning_rate": 4.0908161178155043e-08,
372
- "loss": 21.4497,
373
  "step": 2600
374
  },
375
  {
376
- "epoch": 0.2668412043097372,
377
- "grad_norm": 21.140625,
378
- "learning_rate": 4.1694856585427256e-08,
379
- "loss": 21.3434,
380
  "step": 2650
381
  },
382
  {
383
- "epoch": 0.27187594401369447,
384
- "grad_norm": 21.8125,
385
- "learning_rate": 4.248155199269946e-08,
386
- "loss": 21.3044,
387
  "step": 2700
388
  },
389
  {
390
- "epoch": 0.2769106837176518,
391
- "grad_norm": 20.75,
392
- "learning_rate": 4.3268247399971674e-08,
393
- "loss": 21.3128,
394
  "step": 2750
395
  },
396
  {
397
- "epoch": 0.2819454234216091,
398
- "grad_norm": 21.890625,
399
- "learning_rate": 4.405494280724389e-08,
400
- "loss": 21.3736,
401
  "step": 2800
402
  },
403
  {
404
- "epoch": 0.2869801631255664,
405
- "grad_norm": 24.9375,
406
- "learning_rate": 4.48416382145161e-08,
407
- "loss": 21.2371,
408
  "step": 2850
409
  },
410
  {
411
- "epoch": 0.2920149028295237,
412
- "grad_norm": 21.75,
413
- "learning_rate": 4.562833362178831e-08,
414
- "loss": 21.2297,
415
  "step": 2900
416
  },
417
  {
418
- "epoch": 0.29704964253348104,
419
- "grad_norm": 23.015625,
420
- "learning_rate": 4.6415029029060524e-08,
421
- "loss": 21.1849,
422
  "step": 2950
423
  },
424
  {
425
- "epoch": 0.30208438223743833,
426
- "grad_norm": 35.15625,
427
- "learning_rate": 4.720172443633274e-08,
428
- "loss": 21.1206,
429
  "step": 3000
430
  },
431
  {
432
- "epoch": 0.30208438223743833,
433
- "eval_loss": 2.6425445079803467,
434
- "eval_runtime": 96.7996,
435
- "eval_samples_per_second": 2764.609,
436
- "eval_steps_per_second": 43.203,
437
  "step": 3000
438
  },
439
  {
440
- "epoch": 0.3071191219413956,
441
- "grad_norm": 23.75,
442
- "learning_rate": 4.798841984360495e-08,
443
- "loss": 21.0509,
444
  "step": 3050
445
  },
446
  {
447
- "epoch": 0.3121538616453529,
448
- "grad_norm": 22.0,
449
- "learning_rate": 4.877511525087716e-08,
450
- "loss": 21.0524,
451
  "step": 3100
452
  },
453
  {
454
- "epoch": 0.31718860134931026,
455
- "grad_norm": 23.375,
456
- "learning_rate": 4.956181065814938e-08,
457
- "loss": 21.0385,
458
  "step": 3150
459
  },
460
  {
461
- "epoch": 0.32222334105326755,
462
- "grad_norm": 26.25,
463
- "learning_rate": 5.034850606542159e-08,
464
- "loss": 21.0142,
465
  "step": 3200
466
  },
467
  {
468
- "epoch": 0.32725808075722485,
469
- "grad_norm": 23.25,
470
- "learning_rate": 5.11352014726938e-08,
471
- "loss": 21.0465,
472
  "step": 3250
473
  },
474
  {
475
- "epoch": 0.33229282046118214,
476
- "grad_norm": 23.953125,
477
- "learning_rate": 5.192189687996601e-08,
478
- "loss": 20.8655,
479
  "step": 3300
480
  },
481
  {
482
- "epoch": 0.3373275601651395,
483
- "grad_norm": 21.890625,
484
- "learning_rate": 5.2708592287238224e-08,
485
- "loss": 20.9073,
486
  "step": 3350
487
  },
488
  {
489
- "epoch": 0.3423622998690968,
490
- "grad_norm": 20.890625,
491
- "learning_rate": 5.3495287694510437e-08,
492
- "loss": 20.9968,
493
  "step": 3400
494
  },
495
  {
496
- "epoch": 0.34739703957305407,
497
- "grad_norm": 22.859375,
498
- "learning_rate": 5.428198310178265e-08,
499
- "loss": 20.8051,
500
  "step": 3450
501
  },
502
  {
503
- "epoch": 0.35243177927701136,
504
- "grad_norm": 21.109375,
505
- "learning_rate": 5.506867850905486e-08,
506
- "loss": 20.8349,
507
  "step": 3500
508
  },
509
  {
510
- "epoch": 0.3574665189809687,
511
- "grad_norm": 22.46875,
512
- "learning_rate": 5.5855373916327074e-08,
513
- "loss": 20.8667,
514
  "step": 3550
515
  },
516
  {
517
- "epoch": 0.362501258684926,
518
- "grad_norm": 23.015625,
519
- "learning_rate": 5.6642069323599287e-08,
520
- "loss": 20.7498,
521
  "step": 3600
522
  },
523
  {
524
- "epoch": 0.3675359983888833,
525
- "grad_norm": 21.171875,
526
- "learning_rate": 5.74287647308715e-08,
527
- "loss": 20.7326,
528
  "step": 3650
529
  },
530
  {
531
- "epoch": 0.3725707380928406,
532
- "grad_norm": 22.578125,
533
- "learning_rate": 5.821546013814371e-08,
534
- "loss": 20.6889,
535
  "step": 3700
536
  },
537
  {
538
- "epoch": 0.37760547779679793,
539
- "grad_norm": 22.25,
540
- "learning_rate": 5.9002155545415924e-08,
541
- "loss": 20.8748,
542
  "step": 3750
543
  },
544
  {
545
- "epoch": 0.3826402175007552,
546
- "grad_norm": 22.96875,
547
- "learning_rate": 5.978885095268814e-08,
548
- "loss": 20.6818,
549
  "step": 3800
550
  },
551
  {
552
- "epoch": 0.3876749572047125,
553
- "grad_norm": 25.625,
554
- "learning_rate": 6.057554635996034e-08,
555
- "loss": 20.6082,
556
  "step": 3850
557
  },
558
  {
559
- "epoch": 0.3927096969086698,
560
- "grad_norm": 22.0,
561
- "learning_rate": 6.136224176723256e-08,
562
- "loss": 20.5934,
563
  "step": 3900
564
  },
565
  {
566
- "epoch": 0.39774443661262715,
567
- "grad_norm": 20.5625,
568
- "learning_rate": 6.214893717450477e-08,
569
- "loss": 20.6491,
570
  "step": 3950
571
  },
572
  {
573
- "epoch": 0.40277917631658444,
574
- "grad_norm": 23.625,
575
- "learning_rate": 6.293563258177699e-08,
576
- "loss": 20.6252,
577
  "step": 4000
578
  },
579
  {
580
- "epoch": 0.40781391602054173,
581
- "grad_norm": 26.953125,
582
- "learning_rate": 6.372232798904919e-08,
583
- "loss": 20.5366,
584
  "step": 4050
585
  },
586
  {
587
- "epoch": 0.412848655724499,
588
- "grad_norm": 22.15625,
589
- "learning_rate": 6.450902339632141e-08,
590
- "loss": 20.5188,
591
  "step": 4100
592
  },
593
  {
594
- "epoch": 0.4178833954284564,
595
- "grad_norm": 21.421875,
596
- "learning_rate": 6.529571880359362e-08,
597
- "loss": 20.4495,
598
  "step": 4150
599
  },
600
  {
601
- "epoch": 0.42291813513241366,
602
- "grad_norm": 25.09375,
603
- "learning_rate": 6.608241421086582e-08,
604
- "loss": 20.5249,
605
  "step": 4200
606
  },
607
  {
608
- "epoch": 0.42795287483637096,
609
- "grad_norm": 25.375,
610
- "learning_rate": 6.686910961813804e-08,
611
- "loss": 20.386,
612
  "step": 4250
613
  },
614
  {
615
- "epoch": 0.43298761454032825,
616
- "grad_norm": 22.078125,
617
- "learning_rate": 6.765580502541025e-08,
618
- "loss": 20.2751,
619
  "step": 4300
620
  },
621
  {
622
- "epoch": 0.4380223542442856,
623
- "grad_norm": 25.8125,
624
- "learning_rate": 6.844250043268247e-08,
625
- "loss": 20.2817,
626
  "step": 4350
627
  },
628
  {
629
- "epoch": 0.4430570939482429,
630
- "grad_norm": 22.75,
631
- "learning_rate": 6.922919583995467e-08,
632
- "loss": 20.3044,
633
  "step": 4400
634
  },
635
  {
636
- "epoch": 0.4480918336522002,
637
- "grad_norm": 23.28125,
638
- "learning_rate": 7.00158912472269e-08,
639
- "loss": 20.2628,
640
  "step": 4450
641
  },
642
  {
643
- "epoch": 0.45312657335615747,
644
- "grad_norm": 22.96875,
645
- "learning_rate": 7.080258665449911e-08,
646
- "loss": 20.3045,
647
  "step": 4500
648
  },
649
  {
650
- "epoch": 0.4581613130601148,
651
- "grad_norm": 23.796875,
652
- "learning_rate": 7.158928206177133e-08,
653
- "loss": 20.2352,
654
  "step": 4550
655
  },
656
  {
657
- "epoch": 0.4631960527640721,
658
- "grad_norm": 25.265625,
659
- "learning_rate": 7.237597746904354e-08,
660
- "loss": 20.2659,
661
  "step": 4600
662
  },
663
  {
664
- "epoch": 0.4682307924680294,
665
- "grad_norm": 24.578125,
666
- "learning_rate": 7.316267287631576e-08,
667
- "loss": 20.1321,
668
  "step": 4650
669
  },
670
  {
671
- "epoch": 0.4732655321719867,
672
- "grad_norm": 25.90625,
673
- "learning_rate": 7.394936828358796e-08,
674
- "loss": 20.0429,
675
  "step": 4700
676
  },
677
  {
678
- "epoch": 0.47830027187594404,
679
- "grad_norm": 21.71875,
680
- "learning_rate": 7.473606369086017e-08,
681
- "loss": 20.1437,
682
  "step": 4750
683
  },
684
  {
685
- "epoch": 0.48333501157990133,
686
- "grad_norm": 23.046875,
687
- "learning_rate": 7.552275909813239e-08,
688
- "loss": 20.2228,
689
  "step": 4800
690
  },
691
  {
692
- "epoch": 0.4883697512838586,
693
- "grad_norm": 21.1875,
694
- "learning_rate": 7.630945450540459e-08,
695
- "loss": 20.1252,
696
  "step": 4850
697
  },
698
  {
699
- "epoch": 0.4934044909878159,
700
- "grad_norm": 21.890625,
701
- "learning_rate": 7.709614991267681e-08,
702
- "loss": 20.0736,
703
  "step": 4900
704
  },
705
  {
706
- "epoch": 0.49843923069177326,
707
- "grad_norm": 20.65625,
708
- "learning_rate": 7.788284531994902e-08,
709
- "loss": 20.0553,
710
  "step": 4950
711
  },
712
  {
713
- "epoch": 0.5034739703957305,
714
- "grad_norm": 21.5,
715
- "learning_rate": 7.866954072722124e-08,
716
- "loss": 20.0629,
717
  "step": 5000
718
- },
719
- {
720
- "epoch": 0.5085087100996879,
721
- "grad_norm": 22.390625,
722
- "learning_rate": 7.945623613449344e-08,
723
- "loss": 20.0789,
724
- "step": 5050
725
- },
726
- {
727
- "epoch": 0.5135434498036452,
728
- "grad_norm": 24.96875,
729
- "learning_rate": 8.024293154176566e-08,
730
- "loss": 19.9527,
731
- "step": 5100
732
- },
733
- {
734
- "epoch": 0.5185781895076025,
735
- "grad_norm": 32.1875,
736
- "learning_rate": 8.102962694903787e-08,
737
- "loss": 20.0024,
738
- "step": 5150
739
- },
740
- {
741
- "epoch": 0.5236129292115598,
742
- "grad_norm": 28.796875,
743
- "learning_rate": 8.181632235631009e-08,
744
- "loss": 19.9463,
745
- "step": 5200
746
- },
747
- {
748
- "epoch": 0.5286476689155171,
749
- "grad_norm": 22.859375,
750
- "learning_rate": 8.260301776358229e-08,
751
- "loss": 19.9707,
752
- "step": 5250
753
- },
754
- {
755
- "epoch": 0.5336824086194744,
756
- "grad_norm": 23.46875,
757
- "learning_rate": 8.338971317085451e-08,
758
- "loss": 19.8411,
759
- "step": 5300
760
- },
761
- {
762
- "epoch": 0.5387171483234316,
763
- "grad_norm": 25.03125,
764
- "learning_rate": 8.417640857812672e-08,
765
- "loss": 19.9652,
766
- "step": 5350
767
- },
768
- {
769
- "epoch": 0.5437518880273889,
770
- "grad_norm": 21.65625,
771
- "learning_rate": 8.496310398539892e-08,
772
- "loss": 19.7365,
773
- "step": 5400
774
- },
775
- {
776
- "epoch": 0.5487866277313463,
777
- "grad_norm": 21.5625,
778
- "learning_rate": 8.574979939267114e-08,
779
- "loss": 19.7931,
780
- "step": 5450
781
- },
782
- {
783
- "epoch": 0.5538213674353036,
784
- "grad_norm": 21.484375,
785
- "learning_rate": 8.653649479994335e-08,
786
- "loss": 19.7417,
787
- "step": 5500
788
- },
789
- {
790
- "epoch": 0.5588561071392609,
791
- "grad_norm": 23.546875,
792
- "learning_rate": 8.732319020721557e-08,
793
- "loss": 19.6592,
794
- "step": 5550
795
- },
796
- {
797
- "epoch": 0.5638908468432182,
798
- "grad_norm": 21.78125,
799
- "learning_rate": 8.810988561448777e-08,
800
- "loss": 19.6256,
801
- "step": 5600
802
- },
803
- {
804
- "epoch": 0.5689255865471755,
805
- "grad_norm": 24.6875,
806
- "learning_rate": 8.889658102175999e-08,
807
- "loss": 19.7998,
808
- "step": 5650
809
- },
810
- {
811
- "epoch": 0.5739603262511328,
812
- "grad_norm": 33.0625,
813
- "learning_rate": 8.96832764290322e-08,
814
- "loss": 19.6983,
815
- "step": 5700
816
- },
817
- {
818
- "epoch": 0.5789950659550901,
819
- "grad_norm": 23.03125,
820
- "learning_rate": 9.046997183630442e-08,
821
- "loss": 19.7832,
822
- "step": 5750
823
- },
824
- {
825
- "epoch": 0.5840298056590474,
826
- "grad_norm": 24.796875,
827
- "learning_rate": 9.125666724357662e-08,
828
- "loss": 19.7117,
829
- "step": 5800
830
- },
831
- {
832
- "epoch": 0.5890645453630048,
833
- "grad_norm": 23.109375,
834
- "learning_rate": 9.204336265084884e-08,
835
- "loss": 19.726,
836
- "step": 5850
837
- },
838
- {
839
- "epoch": 0.5940992850669621,
840
- "grad_norm": 22.484375,
841
- "learning_rate": 9.283005805812105e-08,
842
- "loss": 19.6984,
843
- "step": 5900
844
- },
845
- {
846
- "epoch": 0.5991340247709194,
847
- "grad_norm": 21.53125,
848
- "learning_rate": 9.361675346539325e-08,
849
- "loss": 19.7171,
850
- "step": 5950
851
- },
852
- {
853
- "epoch": 0.6041687644748767,
854
- "grad_norm": 23.296875,
855
- "learning_rate": 9.440344887266547e-08,
856
- "loss": 19.5465,
857
- "step": 6000
858
- },
859
- {
860
- "epoch": 0.6041687644748767,
861
- "eval_loss": 2.448728084564209,
862
- "eval_runtime": 97.8245,
863
- "eval_samples_per_second": 2735.644,
864
- "eval_steps_per_second": 42.75,
865
- "step": 6000
866
- },
867
- {
868
- "epoch": 0.609203504178834,
869
- "grad_norm": 24.5625,
870
- "learning_rate": 9.519014427993768e-08,
871
- "loss": 19.5921,
872
- "step": 6050
873
- },
874
- {
875
- "epoch": 0.6142382438827912,
876
- "grad_norm": 23.28125,
877
- "learning_rate": 9.59768396872099e-08,
878
- "loss": 19.5556,
879
- "step": 6100
880
- },
881
- {
882
- "epoch": 0.6192729835867485,
883
- "grad_norm": 23.671875,
884
- "learning_rate": 9.67635350944821e-08,
885
- "loss": 19.6202,
886
- "step": 6150
887
- },
888
- {
889
- "epoch": 0.6243077232907058,
890
- "grad_norm": 26.609375,
891
- "learning_rate": 9.755023050175432e-08,
892
- "loss": 19.5439,
893
- "step": 6200
894
- },
895
- {
896
- "epoch": 0.6293424629946631,
897
- "grad_norm": 26.3125,
898
- "learning_rate": 9.833692590902653e-08,
899
- "loss": 19.4785,
900
- "step": 6250
901
- },
902
- {
903
- "epoch": 0.6343772026986205,
904
- "grad_norm": 24.9375,
905
- "learning_rate": 9.912362131629876e-08,
906
- "loss": 19.4382,
907
- "step": 6300
908
- },
909
- {
910
- "epoch": 0.6394119424025778,
911
- "grad_norm": 25.703125,
912
- "learning_rate": 9.991031672357097e-08,
913
- "loss": 19.4496,
914
- "step": 6350
915
- },
916
- {
917
- "epoch": 0.6444466821065351,
918
- "grad_norm": 23.046875,
919
- "learning_rate": 1.0069701213084319e-07,
920
- "loss": 19.4539,
921
- "step": 6400
922
- },
923
- {
924
- "epoch": 0.6494814218104924,
925
- "grad_norm": 23.203125,
926
- "learning_rate": 1.0148370753811539e-07,
927
- "loss": 19.4393,
928
- "step": 6450
929
- },
930
- {
931
- "epoch": 0.6545161615144497,
932
- "grad_norm": 26.09375,
933
- "learning_rate": 1.022704029453876e-07,
934
- "loss": 19.3709,
935
- "step": 6500
936
- },
937
- {
938
- "epoch": 0.659550901218407,
939
- "grad_norm": 23.46875,
940
- "learning_rate": 1.0305709835265982e-07,
941
- "loss": 19.4344,
942
- "step": 6550
943
- },
944
- {
945
- "epoch": 0.6645856409223643,
946
- "grad_norm": 24.84375,
947
- "learning_rate": 1.0384379375993202e-07,
948
- "loss": 19.2581,
949
- "step": 6600
950
- },
951
- {
952
- "epoch": 0.6696203806263216,
953
- "grad_norm": 24.0,
954
- "learning_rate": 1.0463048916720424e-07,
955
- "loss": 19.3535,
956
- "step": 6650
957
- },
958
- {
959
- "epoch": 0.674655120330279,
960
- "grad_norm": 23.890625,
961
- "learning_rate": 1.0541718457447645e-07,
962
- "loss": 19.1846,
963
- "step": 6700
964
- },
965
- {
966
- "epoch": 0.6796898600342363,
967
- "grad_norm": 22.875,
968
- "learning_rate": 1.0620387998174867e-07,
969
- "loss": 19.234,
970
- "step": 6750
971
- },
972
- {
973
- "epoch": 0.6847245997381936,
974
- "grad_norm": 23.65625,
975
- "learning_rate": 1.0699057538902087e-07,
976
- "loss": 19.1321,
977
- "step": 6800
978
- },
979
- {
980
- "epoch": 0.6897593394421508,
981
- "grad_norm": 21.328125,
982
- "learning_rate": 1.0777727079629309e-07,
983
- "loss": 19.3041,
984
- "step": 6850
985
- },
986
- {
987
- "epoch": 0.6947940791461081,
988
- "grad_norm": 30.234375,
989
- "learning_rate": 1.085639662035653e-07,
990
- "loss": 19.2101,
991
- "step": 6900
992
- },
993
- {
994
- "epoch": 0.6998288188500654,
995
- "grad_norm": 22.609375,
996
- "learning_rate": 1.0935066161083752e-07,
997
- "loss": 19.12,
998
- "step": 6950
999
- },
1000
- {
1001
- "epoch": 0.7048635585540227,
1002
- "grad_norm": 22.921875,
1003
- "learning_rate": 1.1013735701810972e-07,
1004
- "loss": 19.211,
1005
- "step": 7000
1006
- },
1007
- {
1008
- "epoch": 0.70989829825798,
1009
- "grad_norm": 27.5,
1010
- "learning_rate": 1.1092405242538194e-07,
1011
- "loss": 19.0585,
1012
- "step": 7050
1013
- },
1014
- {
1015
- "epoch": 0.7149330379619374,
1016
- "grad_norm": 26.515625,
1017
- "learning_rate": 1.1171074783265415e-07,
1018
- "loss": 19.2151,
1019
- "step": 7100
1020
- },
1021
- {
1022
- "epoch": 0.7199677776658947,
1023
- "grad_norm": 22.734375,
1024
- "learning_rate": 1.1249744323992635e-07,
1025
- "loss": 19.0144,
1026
- "step": 7150
1027
- },
1028
- {
1029
- "epoch": 0.725002517369852,
1030
- "grad_norm": 23.359375,
1031
- "learning_rate": 1.1328413864719857e-07,
1032
- "loss": 19.1019,
1033
- "step": 7200
1034
- },
1035
- {
1036
- "epoch": 0.7300372570738093,
1037
- "grad_norm": 28.09375,
1038
- "learning_rate": 1.1407083405447078e-07,
1039
- "loss": 19.1398,
1040
- "step": 7250
1041
- },
1042
- {
1043
- "epoch": 0.7350719967777666,
1044
- "grad_norm": 24.09375,
1045
- "learning_rate": 1.14857529461743e-07,
1046
- "loss": 19.1944,
1047
- "step": 7300
1048
- },
1049
- {
1050
- "epoch": 0.7401067364817239,
1051
- "grad_norm": 22.625,
1052
- "learning_rate": 1.156442248690152e-07,
1053
- "loss": 19.0951,
1054
- "step": 7350
1055
- },
1056
- {
1057
- "epoch": 0.7451414761856812,
1058
- "grad_norm": 24.8125,
1059
- "learning_rate": 1.1643092027628742e-07,
1060
- "loss": 19.0727,
1061
- "step": 7400
1062
- },
1063
- {
1064
- "epoch": 0.7501762158896385,
1065
- "grad_norm": 24.375,
1066
- "learning_rate": 1.1721761568355963e-07,
1067
- "loss": 19.1014,
1068
- "step": 7450
1069
- },
1070
- {
1071
- "epoch": 0.7552109555935959,
1072
- "grad_norm": 22.4375,
1073
- "learning_rate": 1.1800431109083185e-07,
1074
- "loss": 18.9976,
1075
- "step": 7500
1076
- },
1077
- {
1078
- "epoch": 0.7602456952975531,
1079
- "grad_norm": 23.359375,
1080
- "learning_rate": 1.1879100649810405e-07,
1081
- "loss": 19.0317,
1082
- "step": 7550
1083
- },
1084
- {
1085
- "epoch": 0.7652804350015104,
1086
- "grad_norm": 22.59375,
1087
- "learning_rate": 1.1957770190537627e-07,
1088
- "loss": 19.0108,
1089
- "step": 7600
1090
- },
1091
- {
1092
- "epoch": 0.7703151747054677,
1093
- "grad_norm": 28.8125,
1094
- "learning_rate": 1.2036439731264848e-07,
1095
- "loss": 18.9483,
1096
- "step": 7650
1097
- },
1098
- {
1099
- "epoch": 0.775349914409425,
1100
- "grad_norm": 25.171875,
1101
- "learning_rate": 1.2115109271992069e-07,
1102
- "loss": 19.0207,
1103
- "step": 7700
1104
- },
1105
- {
1106
- "epoch": 0.7803846541133823,
1107
- "grad_norm": 24.015625,
1108
- "learning_rate": 1.2193778812719292e-07,
1109
- "loss": 18.9183,
1110
- "step": 7750
1111
- },
1112
- {
1113
- "epoch": 0.7854193938173396,
1114
- "grad_norm": 21.953125,
1115
- "learning_rate": 1.2272448353446512e-07,
1116
- "loss": 18.8943,
1117
- "step": 7800
1118
- },
1119
- {
1120
- "epoch": 0.7904541335212969,
1121
- "grad_norm": 23.9375,
1122
- "learning_rate": 1.2351117894173733e-07,
1123
- "loss": 18.877,
1124
- "step": 7850
1125
- },
1126
- {
1127
- "epoch": 0.7954888732252543,
1128
- "grad_norm": 23.796875,
1129
- "learning_rate": 1.2429787434900954e-07,
1130
- "loss": 18.7945,
1131
- "step": 7900
1132
- },
1133
- {
1134
- "epoch": 0.8005236129292116,
1135
- "grad_norm": 23.484375,
1136
- "learning_rate": 1.2508456975628177e-07,
1137
- "loss": 18.8354,
1138
- "step": 7950
1139
- },
1140
- {
1141
- "epoch": 0.8055583526331689,
1142
- "grad_norm": 23.84375,
1143
- "learning_rate": 1.2587126516355397e-07,
1144
- "loss": 18.9123,
1145
- "step": 8000
1146
- },
1147
- {
1148
- "epoch": 0.8105930923371262,
1149
- "grad_norm": 22.8125,
1150
- "learning_rate": 1.2665796057082618e-07,
1151
- "loss": 18.7323,
1152
- "step": 8050
1153
- },
1154
- {
1155
- "epoch": 0.8156278320410835,
1156
- "grad_norm": 22.4375,
1157
- "learning_rate": 1.2744465597809839e-07,
1158
- "loss": 18.7036,
1159
- "step": 8100
1160
- },
1161
- {
1162
- "epoch": 0.8206625717450408,
1163
- "grad_norm": 23.953125,
1164
- "learning_rate": 1.2823135138537062e-07,
1165
- "loss": 18.7942,
1166
- "step": 8150
1167
- },
1168
- {
1169
- "epoch": 0.825697311448998,
1170
- "grad_norm": 23.03125,
1171
- "learning_rate": 1.2901804679264282e-07,
1172
- "loss": 18.7461,
1173
- "step": 8200
1174
- },
1175
- {
1176
- "epoch": 0.8307320511529553,
1177
- "grad_norm": 24.8125,
1178
- "learning_rate": 1.2980474219991503e-07,
1179
- "loss": 18.8245,
1180
- "step": 8250
1181
- },
1182
- {
1183
- "epoch": 0.8357667908569127,
1184
- "grad_norm": 22.453125,
1185
- "learning_rate": 1.3059143760718724e-07,
1186
- "loss": 18.7245,
1187
- "step": 8300
1188
- },
1189
- {
1190
- "epoch": 0.84080153056087,
1191
- "grad_norm": 23.609375,
1192
- "learning_rate": 1.3137813301445947e-07,
1193
- "loss": 18.7654,
1194
- "step": 8350
1195
- },
1196
- {
1197
- "epoch": 0.8458362702648273,
1198
- "grad_norm": 22.078125,
1199
- "learning_rate": 1.3216482842173165e-07,
1200
- "loss": 18.6009,
1201
- "step": 8400
1202
- },
1203
- {
1204
- "epoch": 0.8508710099687846,
1205
- "grad_norm": 23.015625,
1206
- "learning_rate": 1.3295152382900388e-07,
1207
- "loss": 18.7458,
1208
- "step": 8450
1209
- },
1210
- {
1211
- "epoch": 0.8559057496727419,
1212
- "grad_norm": 24.578125,
1213
- "learning_rate": 1.3373821923627609e-07,
1214
- "loss": 18.7133,
1215
- "step": 8500
1216
- },
1217
- {
1218
- "epoch": 0.8609404893766992,
1219
- "grad_norm": 20.953125,
1220
- "learning_rate": 1.3452491464354832e-07,
1221
- "loss": 18.6203,
1222
- "step": 8550
1223
- },
1224
- {
1225
- "epoch": 0.8659752290806565,
1226
- "grad_norm": 28.0,
1227
- "learning_rate": 1.353116100508205e-07,
1228
- "loss": 18.6754,
1229
- "step": 8600
1230
- },
1231
- {
1232
- "epoch": 0.8710099687846138,
1233
- "grad_norm": 23.390625,
1234
- "learning_rate": 1.3609830545809273e-07,
1235
- "loss": 18.5753,
1236
- "step": 8650
1237
- },
1238
- {
1239
- "epoch": 0.8760447084885712,
1240
- "grad_norm": 21.296875,
1241
- "learning_rate": 1.3688500086536494e-07,
1242
- "loss": 18.5705,
1243
- "step": 8700
1244
- },
1245
- {
1246
- "epoch": 0.8810794481925285,
1247
- "grad_norm": 22.875,
1248
- "learning_rate": 1.3767169627263717e-07,
1249
- "loss": 18.5268,
1250
- "step": 8750
1251
- },
1252
- {
1253
- "epoch": 0.8861141878964858,
1254
- "grad_norm": 25.90625,
1255
- "learning_rate": 1.3845839167990935e-07,
1256
- "loss": 18.594,
1257
- "step": 8800
1258
- },
1259
- {
1260
- "epoch": 0.8911489276004431,
1261
- "grad_norm": 24.171875,
1262
- "learning_rate": 1.3924508708718158e-07,
1263
- "loss": 18.4483,
1264
- "step": 8850
1265
- },
1266
- {
1267
- "epoch": 0.8961836673044004,
1268
- "grad_norm": 24.59375,
1269
- "learning_rate": 1.400317824944538e-07,
1270
- "loss": 18.548,
1271
- "step": 8900
1272
- },
1273
- {
1274
- "epoch": 0.9012184070083576,
1275
- "grad_norm": 23.34375,
1276
- "learning_rate": 1.40818477901726e-07,
1277
- "loss": 18.3815,
1278
- "step": 8950
1279
- },
1280
- {
1281
- "epoch": 0.9062531467123149,
1282
- "grad_norm": 21.953125,
1283
- "learning_rate": 1.4160517330899822e-07,
1284
- "loss": 18.5225,
1285
- "step": 9000
1286
- },
1287
- {
1288
- "epoch": 0.9062531467123149,
1289
- "eval_loss": 2.3123230934143066,
1290
- "eval_runtime": 96.9463,
1291
- "eval_samples_per_second": 2760.426,
1292
- "eval_steps_per_second": 43.137,
1293
- "step": 9000
1294
- },
1295
- {
1296
- "epoch": 0.9112878864162722,
1297
- "grad_norm": 23.21875,
1298
- "learning_rate": 1.4239186871627043e-07,
1299
- "loss": 18.4194,
1300
- "step": 9050
1301
- },
1302
- {
1303
- "epoch": 0.9163226261202296,
1304
- "grad_norm": 22.3125,
1305
- "learning_rate": 1.4317856412354266e-07,
1306
- "loss": 18.4873,
1307
- "step": 9100
1308
- },
1309
- {
1310
- "epoch": 0.9213573658241869,
1311
- "grad_norm": 21.03125,
1312
- "learning_rate": 1.4396525953081484e-07,
1313
- "loss": 18.2763,
1314
- "step": 9150
1315
- },
1316
- {
1317
- "epoch": 0.9263921055281442,
1318
- "grad_norm": 22.40625,
1319
- "learning_rate": 1.4475195493808707e-07,
1320
- "loss": 18.5129,
1321
- "step": 9200
1322
- },
1323
- {
1324
- "epoch": 0.9314268452321015,
1325
- "grad_norm": 20.890625,
1326
- "learning_rate": 1.4553865034535928e-07,
1327
- "loss": 18.3831,
1328
- "step": 9250
1329
- },
1330
- {
1331
- "epoch": 0.9364615849360588,
1332
- "grad_norm": 27.0625,
1333
- "learning_rate": 1.463253457526315e-07,
1334
- "loss": 18.405,
1335
- "step": 9300
1336
- },
1337
- {
1338
- "epoch": 0.9414963246400161,
1339
- "grad_norm": 24.390625,
1340
- "learning_rate": 1.471120411599037e-07,
1341
- "loss": 18.3264,
1342
- "step": 9350
1343
- },
1344
- {
1345
- "epoch": 0.9465310643439734,
1346
- "grad_norm": 21.734375,
1347
- "learning_rate": 1.4789873656717592e-07,
1348
- "loss": 18.3415,
1349
- "step": 9400
1350
- },
1351
- {
1352
- "epoch": 0.9515658040479307,
1353
- "grad_norm": 28.890625,
1354
- "learning_rate": 1.4868543197444813e-07,
1355
- "loss": 18.3986,
1356
- "step": 9450
1357
- },
1358
- {
1359
- "epoch": 0.9566005437518881,
1360
- "grad_norm": 22.859375,
1361
- "learning_rate": 1.4947212738172034e-07,
1362
- "loss": 18.3425,
1363
- "step": 9500
1364
- },
1365
- {
1366
- "epoch": 0.9616352834558454,
1367
- "grad_norm": 24.515625,
1368
- "learning_rate": 1.5025882278899254e-07,
1369
- "loss": 18.4341,
1370
- "step": 9550
1371
- },
1372
- {
1373
- "epoch": 0.9666700231598027,
1374
- "grad_norm": 21.734375,
1375
- "learning_rate": 1.5104551819626477e-07,
1376
- "loss": 18.3468,
1377
- "step": 9600
1378
- },
1379
- {
1380
- "epoch": 0.97170476286376,
1381
- "grad_norm": 21.46875,
1382
- "learning_rate": 1.5183221360353698e-07,
1383
- "loss": 18.2916,
1384
- "step": 9650
1385
- },
1386
- {
1387
- "epoch": 0.9767395025677172,
1388
- "grad_norm": 23.53125,
1389
- "learning_rate": 1.5261890901080919e-07,
1390
- "loss": 18.268,
1391
- "step": 9700
1392
- },
1393
- {
1394
- "epoch": 0.9817742422716745,
1395
- "grad_norm": 24.78125,
1396
- "learning_rate": 1.534056044180814e-07,
1397
- "loss": 18.3326,
1398
- "step": 9750
1399
- },
1400
- {
1401
- "epoch": 0.9868089819756318,
1402
- "grad_norm": 25.328125,
1403
- "learning_rate": 1.5419229982535362e-07,
1404
- "loss": 18.1188,
1405
- "step": 9800
1406
- },
1407
- {
1408
- "epoch": 0.9918437216795891,
1409
- "grad_norm": 23.5625,
1410
- "learning_rate": 1.5497899523262583e-07,
1411
- "loss": 18.2302,
1412
- "step": 9850
1413
- },
1414
- {
1415
- "epoch": 0.9968784613835465,
1416
- "grad_norm": 22.390625,
1417
- "learning_rate": 1.5576569063989804e-07,
1418
- "loss": 18.2507,
1419
- "step": 9900
1420
  }
1421
  ],
1422
  "logging_steps": 50,
1423
- "max_steps": 9931,
1424
  "num_input_tokens_seen": 0,
1425
  "num_train_epochs": 1,
1426
  "save_steps": 5000,
@@ -1431,12 +729,12 @@
1431
  "should_evaluate": false,
1432
  "should_log": false,
1433
  "should_save": true,
1434
- "should_training_stop": true
1435
  },
1436
  "attributes": {}
1437
  }
1438
  },
1439
- "total_flos": 1.7341121073617306e+18,
1440
  "train_batch_size": 8,
1441
  "trial_name": null,
1442
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2996793431028799,
5
  "eval_steps": 3000,
6
+ "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0029967934310287992,
13
+ "grad_norm": 21.09375,
14
+ "learning_rate": 4.682524817381532e-10,
15
+ "loss": 18.3943,
16
  "step": 50
17
  },
18
  {
19
+ "epoch": 0.0059935868620575984,
20
+ "grad_norm": 20.15625,
21
+ "learning_rate": 9.365049634763063e-10,
22
+ "loss": 18.5436,
23
  "step": 100
24
  },
25
  {
26
+ "epoch": 0.008990380293086398,
27
+ "grad_norm": 20.421875,
28
+ "learning_rate": 1.4047574452144595e-09,
29
+ "loss": 18.5259,
30
  "step": 150
31
  },
32
  {
33
+ "epoch": 0.011987173724115197,
34
+ "grad_norm": 21.171875,
35
+ "learning_rate": 1.8730099269526127e-09,
36
+ "loss": 18.374,
37
  "step": 200
38
  },
39
  {
40
+ "epoch": 0.014983967155143996,
41
+ "grad_norm": 20.640625,
42
+ "learning_rate": 2.341262408690766e-09,
43
+ "loss": 18.3854,
44
  "step": 250
45
  },
46
  {
47
+ "epoch": 0.017980760586172796,
48
+ "grad_norm": 20.0625,
49
+ "learning_rate": 2.809514890428919e-09,
50
+ "loss": 18.3343,
51
  "step": 300
52
  },
53
  {
54
+ "epoch": 0.020977554017201593,
55
+ "grad_norm": 18.984375,
56
+ "learning_rate": 3.277767372167072e-09,
57
+ "loss": 18.3591,
58
  "step": 350
59
  },
60
  {
61
+ "epoch": 0.023974347448230394,
62
+ "grad_norm": 21.90625,
63
+ "learning_rate": 3.746019853905225e-09,
64
+ "loss": 18.2317,
65
  "step": 400
66
  },
67
  {
68
+ "epoch": 0.026971140879259194,
69
+ "grad_norm": 19.484375,
70
+ "learning_rate": 4.214272335643379e-09,
71
+ "loss": 18.2008,
72
  "step": 450
73
  },
74
  {
75
+ "epoch": 0.02996793431028799,
76
+ "grad_norm": 21.765625,
77
+ "learning_rate": 4.682524817381532e-09,
78
+ "loss": 18.2127,
79
  "step": 500
80
  },
81
  {
82
+ "epoch": 0.03296472774131679,
83
+ "grad_norm": 19.15625,
84
+ "learning_rate": 5.150777299119685e-09,
85
+ "loss": 18.2734,
86
  "step": 550
87
  },
88
  {
89
+ "epoch": 0.03596152117234559,
90
+ "grad_norm": 20.171875,
91
+ "learning_rate": 5.619029780857838e-09,
92
+ "loss": 18.2318,
93
  "step": 600
94
  },
95
  {
96
+ "epoch": 0.038958314603374386,
97
+ "grad_norm": 20.90625,
98
+ "learning_rate": 6.087282262595992e-09,
99
+ "loss": 18.179,
100
  "step": 650
101
  },
102
  {
103
+ "epoch": 0.04195510803440319,
104
+ "grad_norm": 21.578125,
105
+ "learning_rate": 6.555534744334144e-09,
106
+ "loss": 18.0297,
107
  "step": 700
108
  },
109
  {
110
+ "epoch": 0.04495190146543199,
111
+ "grad_norm": 20.015625,
112
+ "learning_rate": 7.0237872260722975e-09,
113
+ "loss": 18.0406,
114
  "step": 750
115
  },
116
  {
117
+ "epoch": 0.04794869489646079,
118
+ "grad_norm": 22.359375,
119
+ "learning_rate": 7.49203970781045e-09,
120
+ "loss": 18.1646,
121
  "step": 800
122
  },
123
  {
124
+ "epoch": 0.05094548832748959,
125
+ "grad_norm": 19.578125,
126
+ "learning_rate": 7.960292189548605e-09,
127
+ "loss": 18.065,
128
  "step": 850
129
  },
130
  {
131
+ "epoch": 0.05394228175851839,
132
+ "grad_norm": 20.015625,
133
+ "learning_rate": 8.428544671286759e-09,
134
+ "loss": 18.0043,
135
  "step": 900
136
  },
137
  {
138
+ "epoch": 0.05693907518954718,
139
+ "grad_norm": 18.703125,
140
+ "learning_rate": 8.896797153024911e-09,
141
+ "loss": 17.9912,
142
  "step": 950
143
  },
144
  {
145
+ "epoch": 0.05993586862057598,
146
+ "grad_norm": 19.875,
147
+ "learning_rate": 9.365049634763063e-09,
148
+ "loss": 17.9355,
149
  "step": 1000
150
  },
151
  {
152
+ "epoch": 0.06293266205160478,
153
+ "grad_norm": 19.65625,
154
+ "learning_rate": 9.833302116501217e-09,
155
+ "loss": 17.9403,
156
  "step": 1050
157
  },
158
  {
159
+ "epoch": 0.06592945548263358,
160
+ "grad_norm": 20.71875,
161
+ "learning_rate": 1.030155459823937e-08,
162
+ "loss": 18.0746,
163
  "step": 1100
164
  },
165
  {
166
+ "epoch": 0.06892624891366238,
167
+ "grad_norm": 21.1875,
168
+ "learning_rate": 1.0769807079977524e-08,
169
+ "loss": 18.0771,
170
  "step": 1150
171
  },
172
  {
173
+ "epoch": 0.07192304234469118,
174
+ "grad_norm": 20.546875,
175
+ "learning_rate": 1.1238059561715676e-08,
176
+ "loss": 17.9541,
177
  "step": 1200
178
  },
179
  {
180
+ "epoch": 0.07491983577571998,
181
+ "grad_norm": 20.0,
182
+ "learning_rate": 1.170631204345383e-08,
183
+ "loss": 17.9193,
184
  "step": 1250
185
  },
186
  {
187
+ "epoch": 0.07791662920674877,
188
+ "grad_norm": 20.6875,
189
+ "learning_rate": 1.2174564525191984e-08,
190
+ "loss": 17.8958,
191
  "step": 1300
192
  },
193
  {
194
+ "epoch": 0.08091342263777758,
195
+ "grad_norm": 17.984375,
196
+ "learning_rate": 1.2642817006930135e-08,
197
+ "loss": 17.8691,
198
  "step": 1350
199
  },
200
  {
201
+ "epoch": 0.08391021606880637,
202
+ "grad_norm": 19.96875,
203
+ "learning_rate": 1.3111069488668289e-08,
204
+ "loss": 17.8992,
205
  "step": 1400
206
  },
207
  {
208
+ "epoch": 0.08690700949983518,
209
  "grad_norm": 21.15625,
210
+ "learning_rate": 1.3579321970406443e-08,
211
+ "loss": 17.9409,
212
  "step": 1450
213
  },
214
  {
215
+ "epoch": 0.08990380293086397,
216
+ "grad_norm": 17.890625,
217
+ "learning_rate": 1.4047574452144595e-08,
218
+ "loss": 17.9719,
219
  "step": 1500
220
  },
221
  {
222
+ "epoch": 0.09290059636189278,
223
+ "grad_norm": 24.5,
224
+ "learning_rate": 1.4515826933882749e-08,
225
+ "loss": 17.7969,
226
  "step": 1550
227
  },
228
  {
229
+ "epoch": 0.09589738979292158,
230
+ "grad_norm": 21.21875,
231
+ "learning_rate": 1.49840794156209e-08,
232
+ "loss": 17.7747,
233
  "step": 1600
234
  },
235
  {
236
+ "epoch": 0.09889418322395037,
237
+ "grad_norm": 20.9375,
238
+ "learning_rate": 1.5452331897359057e-08,
239
+ "loss": 17.7449,
240
  "step": 1650
241
  },
242
  {
243
+ "epoch": 0.10189097665497918,
244
+ "grad_norm": 21.75,
245
+ "learning_rate": 1.592058437909721e-08,
246
+ "loss": 17.743,
247
  "step": 1700
248
  },
249
  {
250
+ "epoch": 0.10488777008600797,
251
+ "grad_norm": 21.140625,
252
+ "learning_rate": 1.6388836860835362e-08,
253
+ "loss": 17.7607,
254
  "step": 1750
255
  },
256
  {
257
+ "epoch": 0.10788456351703678,
258
+ "grad_norm": 19.546875,
259
+ "learning_rate": 1.6857089342573517e-08,
260
+ "loss": 17.7645,
261
  "step": 1800
262
  },
263
  {
264
+ "epoch": 0.11088135694806557,
265
+ "grad_norm": 18.90625,
266
+ "learning_rate": 1.7325341824311666e-08,
267
+ "loss": 17.7396,
268
  "step": 1850
269
  },
270
  {
271
+ "epoch": 0.11387815037909436,
272
+ "grad_norm": 19.34375,
273
+ "learning_rate": 1.7793594306049822e-08,
274
+ "loss": 17.6369,
275
  "step": 1900
276
  },
277
  {
278
+ "epoch": 0.11687494381012317,
279
+ "grad_norm": 20.140625,
280
+ "learning_rate": 1.8261846787787974e-08,
281
+ "loss": 17.6624,
282
  "step": 1950
283
  },
284
  {
285
+ "epoch": 0.11987173724115197,
286
+ "grad_norm": 23.890625,
287
+ "learning_rate": 1.8730099269526127e-08,
288
+ "loss": 17.6577,
289
  "step": 2000
290
  },
291
  {
292
+ "epoch": 0.12286853067218077,
293
+ "grad_norm": 18.671875,
294
+ "learning_rate": 1.9198351751264282e-08,
295
+ "loss": 17.6518,
296
  "step": 2050
297
  },
298
  {
299
+ "epoch": 0.12586532410320955,
300
+ "grad_norm": 20.015625,
301
+ "learning_rate": 1.9666604233002435e-08,
302
+ "loss": 17.7459,
303
  "step": 2100
304
  },
305
  {
306
+ "epoch": 0.12886211753423837,
307
+ "grad_norm": 20.640625,
308
+ "learning_rate": 2.0134856714740587e-08,
309
+ "loss": 17.7153,
310
  "step": 2150
311
  },
312
  {
313
+ "epoch": 0.13185891096526717,
314
+ "grad_norm": 20.46875,
315
+ "learning_rate": 2.060310919647874e-08,
316
+ "loss": 17.5973,
317
  "step": 2200
318
  },
319
  {
320
+ "epoch": 0.13485570439629596,
321
+ "grad_norm": 20.765625,
322
+ "learning_rate": 2.1071361678216895e-08,
323
+ "loss": 17.6673,
324
  "step": 2250
325
  },
326
  {
327
+ "epoch": 0.13785249782732475,
328
+ "grad_norm": 22.0,
329
+ "learning_rate": 2.1539614159955047e-08,
330
+ "loss": 17.7575,
331
  "step": 2300
332
  },
333
  {
334
+ "epoch": 0.14084929125835355,
335
+ "grad_norm": 19.015625,
336
+ "learning_rate": 2.20078666416932e-08,
337
+ "loss": 17.6427,
338
  "step": 2350
339
  },
340
  {
341
+ "epoch": 0.14384608468938237,
342
+ "grad_norm": 21.484375,
343
+ "learning_rate": 2.2476119123431352e-08,
344
+ "loss": 17.722,
345
  "step": 2400
346
  },
347
  {
348
+ "epoch": 0.14684287812041116,
349
+ "grad_norm": 21.5,
350
+ "learning_rate": 2.2944371605169504e-08,
351
+ "loss": 17.6005,
352
  "step": 2450
353
  },
354
  {
355
+ "epoch": 0.14983967155143996,
356
+ "grad_norm": 19.859375,
357
+ "learning_rate": 2.341262408690766e-08,
358
+ "loss": 17.4574,
359
  "step": 2500
360
  },
361
  {
362
+ "epoch": 0.15283646498246875,
363
+ "grad_norm": 21.03125,
364
+ "learning_rate": 2.3880876568645812e-08,
365
+ "loss": 17.4844,
366
  "step": 2550
367
  },
368
  {
369
+ "epoch": 0.15583325841349754,
370
+ "grad_norm": 19.828125,
371
+ "learning_rate": 2.4349129050383968e-08,
372
+ "loss": 17.5664,
373
  "step": 2600
374
  },
375
  {
376
+ "epoch": 0.15883005184452637,
377
+ "grad_norm": 22.828125,
378
+ "learning_rate": 2.481738153212212e-08,
379
+ "loss": 17.5743,
380
  "step": 2650
381
  },
382
  {
383
+ "epoch": 0.16182684527555516,
384
+ "grad_norm": 20.984375,
385
+ "learning_rate": 2.528563401386027e-08,
386
+ "loss": 17.551,
387
  "step": 2700
388
  },
389
  {
390
+ "epoch": 0.16482363870658395,
391
+ "grad_norm": 21.546875,
392
+ "learning_rate": 2.5753886495598425e-08,
393
+ "loss": 17.5384,
394
  "step": 2750
395
  },
396
  {
397
+ "epoch": 0.16782043213761275,
398
+ "grad_norm": 22.09375,
399
+ "learning_rate": 2.6222138977336577e-08,
400
+ "loss": 17.5279,
401
  "step": 2800
402
  },
403
  {
404
+ "epoch": 0.17081722556864154,
405
+ "grad_norm": 21.953125,
406
+ "learning_rate": 2.6690391459074733e-08,
407
+ "loss": 17.474,
408
  "step": 2850
409
  },
410
  {
411
+ "epoch": 0.17381401899967036,
412
+ "grad_norm": 18.96875,
413
+ "learning_rate": 2.7158643940812885e-08,
414
+ "loss": 17.5219,
415
  "step": 2900
416
  },
417
  {
418
+ "epoch": 0.17681081243069915,
419
+ "grad_norm": 19.375,
420
+ "learning_rate": 2.762689642255104e-08,
421
+ "loss": 17.4719,
422
  "step": 2950
423
  },
424
  {
425
+ "epoch": 0.17980760586172795,
426
+ "grad_norm": 21.40625,
427
+ "learning_rate": 2.809514890428919e-08,
428
+ "loss": 17.4948,
429
  "step": 3000
430
  },
431
  {
432
+ "epoch": 0.17980760586172795,
433
+ "eval_loss": 2.1850366592407227,
434
+ "eval_runtime": 159.4656,
435
+ "eval_samples_per_second": 2819.436,
436
+ "eval_steps_per_second": 44.06,
437
  "step": 3000
438
  },
439
  {
440
+ "epoch": 0.18280439929275674,
441
+ "grad_norm": 24.109375,
442
+ "learning_rate": 2.8563401386027346e-08,
443
+ "loss": 17.4877,
444
  "step": 3050
445
  },
446
  {
447
+ "epoch": 0.18580119272378556,
448
+ "grad_norm": 20.6875,
449
+ "learning_rate": 2.9031653867765498e-08,
450
+ "loss": 17.3119,
451
  "step": 3100
452
  },
453
  {
454
+ "epoch": 0.18879798615481436,
455
+ "grad_norm": 19.578125,
456
+ "learning_rate": 2.949990634950365e-08,
457
+ "loss": 17.4083,
458
  "step": 3150
459
  },
460
  {
461
+ "epoch": 0.19179477958584315,
462
+ "grad_norm": 20.78125,
463
+ "learning_rate": 2.99681588312418e-08,
464
+ "loss": 17.3361,
465
  "step": 3200
466
  },
467
  {
468
+ "epoch": 0.19479157301687194,
469
+ "grad_norm": 19.859375,
470
+ "learning_rate": 3.0436411312979955e-08,
471
+ "loss": 17.4759,
472
  "step": 3250
473
  },
474
  {
475
+ "epoch": 0.19778836644790074,
476
+ "grad_norm": 20.078125,
477
+ "learning_rate": 3.0904663794718114e-08,
478
+ "loss": 17.3626,
479
  "step": 3300
480
  },
481
  {
482
+ "epoch": 0.20078515987892956,
483
+ "grad_norm": 22.421875,
484
+ "learning_rate": 3.1372916276456266e-08,
485
+ "loss": 17.4349,
486
  "step": 3350
487
  },
488
  {
489
+ "epoch": 0.20378195330995835,
490
+ "grad_norm": 22.21875,
491
+ "learning_rate": 3.184116875819442e-08,
492
+ "loss": 17.4686,
493
  "step": 3400
494
  },
495
  {
496
+ "epoch": 0.20677874674098715,
497
+ "grad_norm": 20.453125,
498
+ "learning_rate": 3.230942123993257e-08,
499
+ "loss": 17.3681,
500
  "step": 3450
501
  },
502
  {
503
+ "epoch": 0.20977554017201594,
504
+ "grad_norm": 19.96875,
505
+ "learning_rate": 3.2777673721670723e-08,
506
+ "loss": 17.4929,
507
  "step": 3500
508
  },
509
  {
510
+ "epoch": 0.21277233360304473,
511
+ "grad_norm": 20.25,
512
+ "learning_rate": 3.3245926203408876e-08,
513
+ "loss": 17.3656,
514
  "step": 3550
515
  },
516
  {
517
+ "epoch": 0.21576912703407355,
518
+ "grad_norm": 20.265625,
519
+ "learning_rate": 3.3714178685147035e-08,
520
+ "loss": 17.3414,
521
  "step": 3600
522
  },
523
  {
524
+ "epoch": 0.21876592046510235,
525
+ "grad_norm": 21.015625,
526
+ "learning_rate": 3.418243116688519e-08,
527
+ "loss": 17.3737,
528
  "step": 3650
529
  },
530
  {
531
+ "epoch": 0.22176271389613114,
532
+ "grad_norm": 20.671875,
533
+ "learning_rate": 3.465068364862333e-08,
534
+ "loss": 17.334,
535
  "step": 3700
536
  },
537
  {
538
+ "epoch": 0.22475950732715994,
539
+ "grad_norm": 23.15625,
540
+ "learning_rate": 3.5118936130361485e-08,
541
+ "loss": 17.445,
542
  "step": 3750
543
  },
544
  {
545
+ "epoch": 0.22775630075818873,
546
+ "grad_norm": 22.046875,
547
+ "learning_rate": 3.5587188612099644e-08,
548
+ "loss": 17.3643,
549
  "step": 3800
550
  },
551
  {
552
+ "epoch": 0.23075309418921755,
553
+ "grad_norm": 21.5,
554
+ "learning_rate": 3.6055441093837797e-08,
555
+ "loss": 17.3034,
556
  "step": 3850
557
  },
558
  {
559
+ "epoch": 0.23374988762024634,
560
+ "grad_norm": 20.75,
561
+ "learning_rate": 3.652369357557595e-08,
562
+ "loss": 17.2343,
563
  "step": 3900
564
  },
565
  {
566
+ "epoch": 0.23674668105127514,
567
+ "grad_norm": 22.140625,
568
+ "learning_rate": 3.69919460573141e-08,
569
+ "loss": 17.322,
570
  "step": 3950
571
  },
572
  {
573
+ "epoch": 0.23974347448230393,
574
+ "grad_norm": 21.6875,
575
+ "learning_rate": 3.7460198539052254e-08,
576
+ "loss": 17.2209,
577
  "step": 4000
578
  },
579
  {
580
+ "epoch": 0.24274026791333272,
581
+ "grad_norm": 20.765625,
582
+ "learning_rate": 3.792845102079041e-08,
583
+ "loss": 17.3421,
584
  "step": 4050
585
  },
586
  {
587
+ "epoch": 0.24573706134436155,
588
+ "grad_norm": 19.3125,
589
+ "learning_rate": 3.8396703502528565e-08,
590
+ "loss": 17.1858,
591
  "step": 4100
592
  },
593
  {
594
+ "epoch": 0.24873385477539034,
595
+ "grad_norm": 21.109375,
596
+ "learning_rate": 3.886495598426672e-08,
597
+ "loss": 17.2446,
598
  "step": 4150
599
  },
600
  {
601
+ "epoch": 0.2517306482064191,
602
+ "grad_norm": 24.671875,
603
+ "learning_rate": 3.933320846600487e-08,
604
+ "loss": 17.3155,
605
  "step": 4200
606
  },
607
  {
608
+ "epoch": 0.2547274416374479,
609
+ "grad_norm": 21.921875,
610
+ "learning_rate": 3.9801460947743015e-08,
611
+ "loss": 17.2362,
612
  "step": 4250
613
  },
614
  {
615
+ "epoch": 0.25772423506847675,
616
+ "grad_norm": 22.1875,
617
+ "learning_rate": 4.0269713429481174e-08,
618
+ "loss": 17.1679,
619
  "step": 4300
620
  },
621
  {
622
+ "epoch": 0.2607210284995055,
623
+ "grad_norm": 20.484375,
624
+ "learning_rate": 4.0737965911219327e-08,
625
+ "loss": 17.0951,
626
  "step": 4350
627
  },
628
  {
629
+ "epoch": 0.26371782193053434,
630
+ "grad_norm": 19.15625,
631
+ "learning_rate": 4.120621839295748e-08,
632
+ "loss": 17.167,
633
  "step": 4400
634
  },
635
  {
636
+ "epoch": 0.2667146153615631,
637
+ "grad_norm": 19.203125,
638
+ "learning_rate": 4.167447087469563e-08,
639
+ "loss": 17.1029,
640
  "step": 4450
641
  },
642
  {
643
+ "epoch": 0.2697114087925919,
644
+ "grad_norm": 20.140625,
645
+ "learning_rate": 4.214272335643379e-08,
646
+ "loss": 17.2224,
647
  "step": 4500
648
  },
649
  {
650
+ "epoch": 0.27270820222362074,
651
+ "grad_norm": 21.28125,
652
+ "learning_rate": 4.261097583817194e-08,
653
+ "loss": 17.1751,
654
  "step": 4550
655
  },
656
  {
657
+ "epoch": 0.2757049956546495,
658
+ "grad_norm": 19.796875,
659
+ "learning_rate": 4.3079228319910095e-08,
660
+ "loss": 17.1403,
661
  "step": 4600
662
  },
663
  {
664
+ "epoch": 0.27870178908567833,
665
+ "grad_norm": 20.59375,
666
+ "learning_rate": 4.354748080164825e-08,
667
+ "loss": 17.1065,
668
  "step": 4650
669
  },
670
  {
671
+ "epoch": 0.2816985825167071,
672
+ "grad_norm": 24.171875,
673
+ "learning_rate": 4.40157332833864e-08,
674
+ "loss": 17.0031,
675
  "step": 4700
676
  },
677
  {
678
+ "epoch": 0.2846953759477359,
679
+ "grad_norm": 20.6875,
680
+ "learning_rate": 4.448398576512456e-08,
681
+ "loss": 17.1772,
682
  "step": 4750
683
  },
684
  {
685
+ "epoch": 0.28769216937876474,
686
+ "grad_norm": 22.0625,
687
+ "learning_rate": 4.4952238246862704e-08,
688
+ "loss": 17.0814,
689
  "step": 4800
690
  },
691
  {
692
+ "epoch": 0.2906889628097935,
693
+ "grad_norm": 19.765625,
694
+ "learning_rate": 4.5420490728600857e-08,
695
+ "loss": 17.1585,
696
  "step": 4850
697
  },
698
  {
699
+ "epoch": 0.2936857562408223,
700
+ "grad_norm": 21.796875,
701
+ "learning_rate": 4.588874321033901e-08,
702
+ "loss": 17.134,
703
  "step": 4900
704
  },
705
  {
706
+ "epoch": 0.2966825496718511,
707
+ "grad_norm": 19.53125,
708
+ "learning_rate": 4.635699569207717e-08,
709
+ "loss": 17.084,
710
  "step": 4950
711
  },
712
  {
713
+ "epoch": 0.2996793431028799,
714
+ "grad_norm": 24.828125,
715
+ "learning_rate": 4.682524817381532e-08,
716
+ "loss": 17.1089,
717
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  }
719
  ],
720
  "logging_steps": 50,
721
+ "max_steps": 16684,
722
  "num_input_tokens_seen": 0,
723
  "num_train_epochs": 1,
724
  "save_steps": 5000,
 
729
  "should_evaluate": false,
730
  "should_log": false,
731
  "should_save": true,
732
+ "should_training_stop": false
733
  },
734
  "attributes": {}
735
  }
736
  },
737
+ "total_flos": 8.730803076857856e+17,
738
  "train_batch_size": 8,
739
  "trial_name": null,
740
  "trial_params": null