ma2za committed on
Commit
929e3ab
·
1 Parent(s): ce2c253

Training in progress, step 600

Browse files
config.json CHANGED
@@ -39,7 +39,7 @@
39
  "pad_token_id": 1,
40
  "position_embedding_type": "absolute",
41
  "torch_dtype": "float32",
42
- "transformers_version": "4.30.2",
43
  "type_vocab_size": 1,
44
  "use_cache": true,
45
  "vocab_size": 250002
 
39
  "pad_token_id": 1,
40
  "position_embedding_type": "absolute",
41
  "torch_dtype": "float32",
42
+ "transformers_version": "4.31.0",
43
  "type_vocab_size": 1,
44
  "use_cache": true,
45
  "vocab_size": 250002
last-checkpoint/config.json CHANGED
@@ -39,7 +39,7 @@
39
  "pad_token_id": 1,
40
  "position_embedding_type": "absolute",
41
  "torch_dtype": "float32",
42
- "transformers_version": "4.30.2",
43
  "type_vocab_size": 1,
44
  "use_cache": true,
45
  "vocab_size": 250002
 
39
  "pad_token_id": 1,
40
  "position_embedding_type": "absolute",
41
  "torch_dtype": "float32",
42
+ "transformers_version": "4.31.0",
43
  "type_vocab_size": 1,
44
  "use_cache": true,
45
  "vocab_size": 250002
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5d5ecb7cea99166e5331fd83af0abf4700f34f8fa7c55069b55e5cc11ef77b6e
3
  size 2117841669
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aafda980146253cd7b82b0dd2e2a1211ce96ed9fccbf804597a542c6abfab59e
3
  size 2117841669
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a95dc90cf2c6fb077a16d6d006d07123ca58d93e9477b64a60af5efde8712c0f
3
- size 2268278205
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:925941313b310518092ca4c0ce765c6ed1e75d6a07a200cdf61def1fa48fdef8
3
+ size 2268273785
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60950be020388f13bb3603ac853f8ba2158b6fdee7c4c316c0178e90051ada25
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a45a08059f2351fc7b31a0c40f1b65158be9ff667b5b2468faf63f546d65222b
3
  size 14575
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd884e1d2e2e8520444f4aed44797bb12512ec301ef2843eb0c7f004efc2a56e
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90b1e78b64b76d121ce0a17312e4bc1a4adeafa3fb2ee51db16618bc84d92166
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,440 +1,32 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.2591815320041972,
5
- "global_step": 12000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
- {
11
- "epoch": 0.04,
12
- "eval_accuracy": 0.7687216869069928,
13
- "eval_f1": 0.7665424923390772,
14
- "eval_loss": 0.9364227652549744,
15
- "eval_runtime": 684.6631,
16
- "eval_samples_per_second": 197.963,
17
- "eval_steps_per_second": 0.774,
18
- "step": 400
19
- },
20
  {
21
  "epoch": 0.05,
22
- "learning_rate": 9.994774396642183e-06,
23
- "loss": 1.2511,
24
  "step": 500
25
  },
26
  {
27
- "epoch": 0.08,
28
- "eval_accuracy": 0.8296492496569228,
29
- "eval_f1": 0.8293241244922317,
30
- "eval_loss": 0.8166272044181824,
31
- "eval_runtime": 684.0816,
32
- "eval_samples_per_second": 198.131,
33
- "eval_steps_per_second": 0.775,
34
- "step": 800
35
- },
36
- {
37
- "epoch": 0.1,
38
- "learning_rate": 9.989538300104932e-06,
39
- "loss": 0.8696,
40
- "step": 1000
41
- },
42
- {
43
- "epoch": 0.13,
44
- "eval_accuracy": 0.846795732562086,
45
- "eval_f1": 0.8476031571339556,
46
- "eval_loss": 0.7674374580383301,
47
- "eval_runtime": 684.9072,
48
- "eval_samples_per_second": 197.892,
49
- "eval_steps_per_second": 0.774,
50
- "step": 1200
51
- },
52
- {
53
- "epoch": 0.16,
54
- "learning_rate": 9.98429171038825e-06,
55
- "loss": 0.8001,
56
- "step": 1500
57
- },
58
- {
59
- "epoch": 0.17,
60
- "eval_accuracy": 0.8562469565730644,
61
- "eval_f1": 0.8555885462291756,
62
- "eval_loss": 0.7412897348403931,
63
- "eval_runtime": 685.4288,
64
- "eval_samples_per_second": 197.742,
65
- "eval_steps_per_second": 0.773,
66
- "step": 1600
67
- },
68
- {
69
- "epoch": 0.21,
70
- "learning_rate": 9.97906610703043e-06,
71
- "loss": 0.7653,
72
- "step": 2000
73
- },
74
- {
75
- "epoch": 0.21,
76
- "eval_accuracy": 0.8584087119479408,
77
- "eval_f1": 0.8589656756888883,
78
- "eval_loss": 0.7346311807632446,
79
- "eval_runtime": 1253.8639,
80
- "eval_samples_per_second": 108.096,
81
- "eval_steps_per_second": 0.423,
82
- "step": 2000
83
- },
84
- {
85
- "epoch": 0.25,
86
- "eval_accuracy": 0.8606590033791262,
87
- "eval_f1": 0.8613764873641668,
88
- "eval_loss": 0.7210016846656799,
89
- "eval_runtime": 749.1336,
90
- "eval_samples_per_second": 180.926,
91
- "eval_steps_per_second": 0.707,
92
- "step": 2400
93
- },
94
- {
95
- "epoch": 0.26,
96
- "learning_rate": 9.973840503672613e-06,
97
- "loss": 0.7517,
98
- "step": 2500
99
- },
100
- {
101
- "epoch": 0.29,
102
- "eval_accuracy": 0.8668491493160589,
103
- "eval_f1": 0.8662439909072627,
104
- "eval_loss": 0.7115087509155273,
105
- "eval_runtime": 748.5848,
106
- "eval_samples_per_second": 181.059,
107
- "eval_steps_per_second": 0.708,
108
- "step": 2800
109
- },
110
- {
111
- "epoch": 0.31,
112
- "learning_rate": 9.968604407135362e-06,
113
- "loss": 0.7386,
114
- "step": 3000
115
- },
116
- {
117
- "epoch": 0.34,
118
- "eval_accuracy": 0.8676090837993773,
119
- "eval_f1": 0.8684186652379281,
120
- "eval_loss": 0.7080894708633423,
121
- "eval_runtime": 748.8743,
122
- "eval_samples_per_second": 180.989,
123
- "eval_steps_per_second": 0.708,
124
- "step": 3200
125
- },
126
- {
127
- "epoch": 0.37,
128
- "learning_rate": 9.963389296956978e-06,
129
- "loss": 0.7238,
130
- "step": 3500
131
- },
132
- {
133
- "epoch": 0.38,
134
- "eval_accuracy": 0.8690256607003202,
135
- "eval_f1": 0.8697884276426098,
136
- "eval_loss": 0.7015646696090698,
137
- "eval_runtime": 666.7136,
138
- "eval_samples_per_second": 203.293,
139
- "eval_steps_per_second": 0.795,
140
- "step": 3600
141
- },
142
- {
143
- "epoch": 0.42,
144
- "learning_rate": 9.958153200419727e-06,
145
- "loss": 0.7216,
146
- "step": 4000
147
- },
148
- {
149
- "epoch": 0.42,
150
- "eval_accuracy": 0.8712390621080435,
151
- "eval_f1": 0.8724240607774923,
152
- "eval_loss": 0.6999966502189636,
153
- "eval_runtime": 666.1677,
154
- "eval_samples_per_second": 203.459,
155
- "eval_steps_per_second": 0.796,
156
- "step": 4000
157
- },
158
- {
159
- "epoch": 0.46,
160
- "eval_accuracy": 0.87027992149803,
161
- "eval_f1": 0.8704618561200863,
162
- "eval_loss": 0.6968262195587158,
163
- "eval_runtime": 666.1311,
164
- "eval_samples_per_second": 203.47,
165
- "eval_steps_per_second": 0.796,
166
- "step": 4400
167
- },
168
- {
169
- "epoch": 0.47,
170
- "learning_rate": 9.952906610703044e-06,
171
- "loss": 0.7123,
172
- "step": 4500
173
- },
174
- {
175
- "epoch": 0.5,
176
- "eval_accuracy": 0.8710250999719635,
177
- "eval_f1": 0.8710228151932407,
178
- "eval_loss": 0.6919674277305603,
179
- "eval_runtime": 666.6562,
180
- "eval_samples_per_second": 203.31,
181
- "eval_steps_per_second": 0.795,
182
- "step": 4800
183
- },
184
- {
185
- "epoch": 0.52,
186
- "learning_rate": 9.94766002098636e-06,
187
- "loss": 0.708,
188
- "step": 5000
189
- },
190
- {
191
- "epoch": 0.55,
192
- "eval_accuracy": 0.8743452020835485,
193
- "eval_f1": 0.8736511770234788,
194
- "eval_loss": 0.6875823736190796,
195
- "eval_runtime": 666.8888,
196
- "eval_samples_per_second": 203.239,
197
- "eval_steps_per_second": 0.795,
198
- "step": 5200
199
- },
200
- {
201
- "epoch": 0.58,
202
- "learning_rate": 9.94242392444911e-06,
203
- "loss": 0.705,
204
- "step": 5500
205
- },
206
- {
207
- "epoch": 0.59,
208
- "eval_accuracy": 0.8718293024834364,
209
- "eval_f1": 0.8721073032565451,
210
- "eval_loss": 0.6912590861320496,
211
- "eval_runtime": 666.9048,
212
- "eval_samples_per_second": 203.234,
213
- "eval_steps_per_second": 0.795,
214
- "step": 5600
215
- },
216
- {
217
- "epoch": 0.63,
218
- "learning_rate": 9.937208814270725e-06,
219
- "loss": 0.6986,
220
- "step": 6000
221
- },
222
- {
223
- "epoch": 0.63,
224
- "eval_accuracy": 0.8746108102524753,
225
- "eval_f1": 0.8755518143887745,
226
- "eval_loss": 0.6850203275680542,
227
- "eval_runtime": 670.2388,
228
- "eval_samples_per_second": 202.223,
229
- "eval_steps_per_second": 0.791,
230
- "step": 6000
231
- },
232
- {
233
- "epoch": 0.67,
234
- "eval_accuracy": 0.8748395283979401,
235
- "eval_f1": 0.8750168188156892,
236
- "eval_loss": 0.6849112510681152,
237
- "eval_runtime": 670.7573,
238
- "eval_samples_per_second": 202.067,
239
- "eval_steps_per_second": 0.79,
240
- "step": 6400
241
- },
242
- {
243
- "epoch": 0.68,
244
- "learning_rate": 9.93196222455404e-06,
245
- "loss": 0.6986,
246
- "step": 6500
247
- },
248
- {
249
- "epoch": 0.71,
250
- "eval_accuracy": 0.8763077513317299,
251
- "eval_f1": 0.8766362923981238,
252
- "eval_loss": 0.6798161864280701,
253
- "eval_runtime": 669.9612,
254
- "eval_samples_per_second": 202.307,
255
- "eval_steps_per_second": 0.791,
256
- "step": 6800
257
- },
258
- {
259
- "epoch": 0.73,
260
- "learning_rate": 9.926715634837356e-06,
261
- "loss": 0.6948,
262
- "step": 7000
263
- },
264
- {
265
- "epoch": 0.76,
266
- "eval_accuracy": 0.8761897032566512,
267
- "eval_f1": 0.8771094350236266,
268
- "eval_loss": 0.6797441840171814,
269
- "eval_runtime": 670.6385,
270
- "eval_samples_per_second": 202.103,
271
- "eval_steps_per_second": 0.79,
272
- "step": 7200
273
- },
274
- {
275
- "epoch": 0.79,
276
- "learning_rate": 9.921469045120673e-06,
277
- "loss": 0.694,
278
- "step": 7500
279
- },
280
- {
281
- "epoch": 0.8,
282
- "eval_accuracy": 0.8757765349938763,
283
- "eval_f1": 0.875946128594897,
284
- "eval_loss": 0.6784085631370544,
285
- "eval_runtime": 670.4256,
286
- "eval_samples_per_second": 202.167,
287
- "eval_steps_per_second": 0.791,
288
- "step": 7600
289
- },
290
- {
291
- "epoch": 0.84,
292
- "learning_rate": 9.916232948583422e-06,
293
- "loss": 0.6907,
294
- "step": 8000
295
- },
296
- {
297
- "epoch": 0.84,
298
- "eval_accuracy": 0.8782776785846036,
299
- "eval_f1": 0.8778960944609727,
300
- "eval_loss": 0.6746197938919067,
301
- "eval_runtime": 670.9876,
302
- "eval_samples_per_second": 201.998,
303
- "eval_steps_per_second": 0.79,
304
- "step": 8000
305
- },
306
- {
307
- "epoch": 0.88,
308
- "eval_accuracy": 0.8770529298056634,
309
- "eval_f1": 0.8765354326008715,
310
- "eval_loss": 0.6747441291809082,
311
- "eval_runtime": 1255.3569,
312
- "eval_samples_per_second": 107.968,
313
- "eval_steps_per_second": 0.422,
314
- "step": 8400
315
- },
316
- {
317
- "epoch": 0.89,
318
- "learning_rate": 9.911007345225604e-06,
319
- "loss": 0.6879,
320
- "step": 8500
321
- },
322
- {
323
- "epoch": 0.94,
324
- "learning_rate": 9.905781741867787e-06,
325
- "loss": 0.6875,
326
- "step": 9000
327
- },
328
- {
329
- "epoch": 0.94,
330
- "eval_accuracy": 0.8780932284672933,
331
- "eval_f1": 0.879247713093542,
332
- "eval_loss": 0.6763164401054382,
333
- "eval_runtime": 687.9898,
334
- "eval_samples_per_second": 197.006,
335
- "eval_steps_per_second": 0.77,
336
- "step": 9000
337
- },
338
- {
339
- "epoch": 1.0,
340
- "learning_rate": 9.900545645330536e-06,
341
- "loss": 0.686,
342
- "step": 9500
343
- },
344
- {
345
- "epoch": 1.01,
346
- "eval_accuracy": 0.8796352314480071,
347
- "eval_f1": 0.8798913903465643,
348
- "eval_loss": 0.6714671850204468,
349
- "eval_runtime": 688.5552,
350
- "eval_samples_per_second": 196.844,
351
- "eval_steps_per_second": 0.77,
352
- "step": 9600
353
- },
354
- {
355
- "epoch": 1.05,
356
- "learning_rate": 9.895309548793285e-06,
357
- "loss": 0.6828,
358
- "step": 10000
359
- },
360
- {
361
- "epoch": 1.07,
362
- "eval_accuracy": 0.8783293246174505,
363
- "eval_f1": 0.8770525463818627,
364
- "eval_loss": 0.6699363589286804,
365
- "eval_runtime": 734.0798,
366
- "eval_samples_per_second": 184.637,
367
- "eval_steps_per_second": 0.722,
368
- "step": 10200
369
- },
370
- {
371
- "epoch": 1.1,
372
- "learning_rate": 9.890073452256035e-06,
373
- "loss": 0.679,
374
- "step": 10500
375
- },
376
- {
377
- "epoch": 1.13,
378
- "eval_accuracy": 0.8791187711195384,
379
- "eval_f1": 0.8798211781418543,
380
- "eval_loss": 0.6693193912506104,
381
- "eval_runtime": 733.9523,
382
- "eval_samples_per_second": 184.669,
383
- "eval_steps_per_second": 0.722,
384
- "step": 10800
385
- },
386
- {
387
- "epoch": 1.15,
388
- "learning_rate": 9.884837355718784e-06,
389
- "loss": 0.679,
390
- "step": 11000
391
- },
392
- {
393
- "epoch": 1.18,
394
- "eval_accuracy": 0.8803435198984787,
395
- "eval_f1": 0.8801011265002094,
396
- "eval_loss": 0.6679297685623169,
397
- "eval_runtime": 684.5882,
398
- "eval_samples_per_second": 197.985,
399
- "eval_steps_per_second": 0.774,
400
- "step": 11200
401
- },
402
- {
403
- "epoch": 1.21,
404
- "learning_rate": 9.8795907660021e-06,
405
- "loss": 0.678,
406
- "step": 11500
407
- },
408
- {
409
- "epoch": 1.22,
410
- "eval_accuracy": 0.8807862001800233,
411
- "eval_f1": 0.8812929844052909,
412
- "eval_loss": 0.6662477254867554,
413
- "eval_runtime": 684.0407,
414
- "eval_samples_per_second": 198.143,
415
- "eval_steps_per_second": 0.775,
416
- "step": 11600
417
- },
418
- {
419
- "epoch": 1.26,
420
- "learning_rate": 9.874354669464849e-06,
421
- "loss": 0.6767,
422
- "step": 12000
423
- },
424
- {
425
- "epoch": 1.26,
426
- "eval_accuracy": 0.8794138913072349,
427
- "eval_f1": 0.8799114820136512,
428
- "eval_loss": 0.6668325662612915,
429
- "eval_runtime": 685.344,
430
- "eval_samples_per_second": 197.766,
431
- "eval_steps_per_second": 0.773,
432
- "step": 12000
433
  }
434
  ],
435
  "max_steps": 953000,
436
  "num_train_epochs": 100,
437
- "total_flos": 5.0440310829817606e+17,
438
  "trial_name": null,
439
  "trial_params": null
440
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.06295907660020986,
5
+ "global_step": 600,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
 
 
 
 
 
 
 
 
 
 
10
  {
11
  "epoch": 0.05,
12
+ "learning_rate": 9.99476390346275e-06,
13
+ "loss": 1.6498,
14
  "step": 500
15
  },
16
  {
17
+ "epoch": 0.06,
18
+ "eval_accuracy": 0.430292611666101,
19
+ "eval_f1": 0.31503797170432934,
20
+ "eval_loss": 1.56297767162323,
21
+ "eval_runtime": 713.0904,
22
+ "eval_samples_per_second": 190.071,
23
+ "eval_steps_per_second": 0.743,
24
+ "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ],
27
  "max_steps": 953000,
28
  "num_train_epochs": 100,
29
+ "total_flos": 2.544857570206464e+16,
30
  "trial_name": null,
31
  "trial_params": null
32
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a52f292489e814d9274228a24a069717d8777c98bc256de62a83046896fea77e
3
- size 3963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14dd9d3e983872605882053c5baf561625d8c8afd79b7a7ea8eef6f99645bedb
3
+ size 4027
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a95dc90cf2c6fb077a16d6d006d07123ca58d93e9477b64a60af5efde8712c0f
3
- size 2268278205
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:925941313b310518092ca4c0ce765c6ed1e75d6a07a200cdf61def1fa48fdef8
3
+ size 2268273785
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a52f292489e814d9274228a24a069717d8777c98bc256de62a83046896fea77e
3
- size 3963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14dd9d3e983872605882053c5baf561625d8c8afd79b7a7ea8eef6f99645bedb
3
+ size 4027