Rakhman16 committed on
Commit
4ac1a0a
·
verified ·
1 Parent(s): 7c6168a

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d25040acf885f7e2920c47759e972a9c964c43aa59fe6576736ecb99705b9e7f
3
  size 891558696
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39aa4cbab7de475a35a80d9b0d2693ecf4825d9d96171dccf8c56a8dfe863ab0
3
  size 891558696
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc841690d9edeb441c534cb39ba0e0b76571370a7dc70d911503af4a861ef3c3
3
  size 1783272762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd0048b75921b40f9628393fc371f1ea43397a61336f7fb405b2de81efe82eb9
3
  size 1783272762
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee7003d2d3db8b5d062c3280168e1b356926dfcb2c85d0b9bea95ac9bb64d84f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdbe2638c7caf1c99648b98db61ed244e5ab2b8152ba929e7b299ab487f42773
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d51f93e5e4e2970e1d4467bbc53489257074e326323db890e39b7a999de6e4d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bb3f2ff6f7e6781542bdfb9729073f4ea2e4dde4572d86a2813e058d1eb3526
3
  size 1064
last-checkpoint/tokenizer.json CHANGED
@@ -2,13 +2,13 @@
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
- "max_length": 256,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
9
  "padding": {
10
  "strategy": {
11
- "Fixed": 256
12
  },
13
  "direction": "Right",
14
  "pad_to_multiple_of": null,
 
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
+ "max_length": 128,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
9
  "padding": {
10
  "strategy": {
11
+ "Fixed": 128
12
  },
13
  "direction": "Right",
14
  "pad_to_multiple_of": null,
last-checkpoint/trainer_state.json CHANGED
@@ -1,346 +1,126 @@
1
  {
2
- "best_metric": 0.11325465887784958,
3
- "best_model_checkpoint": "./fine-tuned/checkpoint-1500",
4
- "epoch": 0.13173494928204452,
5
  "eval_steps": 100,
6
- "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.004391164976068151,
13
- "grad_norm": 0.4158438444137573,
14
- "learning_rate": 4.994510802740208e-05,
15
- "loss": 0.1393,
16
  "step": 50
17
  },
18
  {
19
- "epoch": 0.008782329952136302,
20
- "grad_norm": 0.45840761065483093,
21
- "learning_rate": 4.9890216054804146e-05,
22
- "loss": 0.1402,
23
  "step": 100
24
  },
25
  {
26
- "epoch": 0.008782329952136302,
27
- "eval_loss": 0.12196873873472214,
28
- "eval_runtime": 82.4004,
29
- "eval_samples_per_second": 54.126,
30
- "eval_steps_per_second": 13.531,
31
  "step": 100
32
  },
33
  {
34
- "epoch": 0.013173494928204453,
35
- "grad_norm": 0.6116240620613098,
36
- "learning_rate": 4.983532408220622e-05,
37
- "loss": 0.1401,
38
  "step": 150
39
  },
40
  {
41
- "epoch": 0.017564659904272605,
42
- "grad_norm": 7.667095184326172,
43
- "learning_rate": 4.9780432109608296e-05,
44
- "loss": 0.1256,
45
  "step": 200
46
  },
47
  {
48
- "epoch": 0.017564659904272605,
49
- "eval_loss": 0.11987816542387009,
50
- "eval_runtime": 83.2537,
51
- "eval_samples_per_second": 53.571,
52
- "eval_steps_per_second": 13.393,
53
  "step": 200
54
  },
55
  {
56
- "epoch": 0.021955824880340754,
57
- "grad_norm": 0.5275366902351379,
58
- "learning_rate": 4.972554013701037e-05,
59
- "loss": 0.1227,
60
  "step": 250
61
  },
62
  {
63
- "epoch": 0.026346989856408907,
64
- "grad_norm": 0.7326502203941345,
65
- "learning_rate": 4.967064816441244e-05,
66
- "loss": 0.1312,
67
  "step": 300
68
  },
69
  {
70
- "epoch": 0.026346989856408907,
71
- "eval_loss": 0.11818940937519073,
72
- "eval_runtime": 82.8316,
73
- "eval_samples_per_second": 53.844,
74
- "eval_steps_per_second": 13.461,
75
  "step": 300
76
  },
77
  {
78
- "epoch": 0.030738154832477056,
79
- "grad_norm": 0.5537102818489075,
80
- "learning_rate": 4.9615756191814514e-05,
81
- "loss": 0.1247,
82
  "step": 350
83
  },
84
  {
85
- "epoch": 0.03512931980854521,
86
- "grad_norm": 0.505450963973999,
87
- "learning_rate": 4.956086421921658e-05,
88
- "loss": 0.1244,
89
  "step": 400
90
  },
91
  {
92
- "epoch": 0.03512931980854521,
93
- "eval_loss": 0.11761000752449036,
94
- "eval_runtime": 83.0177,
95
- "eval_samples_per_second": 53.724,
96
- "eval_steps_per_second": 13.431,
97
  "step": 400
98
  },
99
  {
100
- "epoch": 0.039520484784613355,
101
- "grad_norm": 0.4453428089618683,
102
- "learning_rate": 4.950597224661866e-05,
103
- "loss": 0.1304,
104
  "step": 450
105
  },
106
  {
107
- "epoch": 0.04391164976068151,
108
- "grad_norm": 0.40634244680404663,
109
- "learning_rate": 4.9451080274020725e-05,
110
- "loss": 0.1343,
111
  "step": 500
112
  },
113
  {
114
- "epoch": 0.04391164976068151,
115
- "eval_loss": 0.1168845146894455,
116
- "eval_runtime": 82.6617,
117
- "eval_samples_per_second": 53.955,
118
- "eval_steps_per_second": 13.489,
119
  "step": 500
120
- },
121
- {
122
- "epoch": 0.04830281473674966,
123
- "grad_norm": 0.7192500233650208,
124
- "learning_rate": 4.93961883014228e-05,
125
- "loss": 0.1312,
126
- "step": 550
127
- },
128
- {
129
- "epoch": 0.052693979712817814,
130
- "grad_norm": 0.4769364595413208,
131
- "learning_rate": 4.9341296328824875e-05,
132
- "loss": 0.124,
133
- "step": 600
134
- },
135
- {
136
- "epoch": 0.052693979712817814,
137
- "eval_loss": 0.11658164858818054,
138
- "eval_runtime": 86.2919,
139
- "eval_samples_per_second": 51.685,
140
- "eval_steps_per_second": 12.921,
141
- "step": 600
142
- },
143
- {
144
- "epoch": 0.05708514468888596,
145
- "grad_norm": 0.45539864897727966,
146
- "learning_rate": 4.9286404356226944e-05,
147
- "loss": 0.1225,
148
- "step": 650
149
- },
150
- {
151
- "epoch": 0.06147630966495411,
152
- "grad_norm": 0.640469491481781,
153
- "learning_rate": 4.923151238362902e-05,
154
- "loss": 0.1403,
155
- "step": 700
156
- },
157
- {
158
- "epoch": 0.06147630966495411,
159
- "eval_loss": 0.11597927659749985,
160
- "eval_runtime": 82.556,
161
- "eval_samples_per_second": 54.024,
162
- "eval_steps_per_second": 13.506,
163
- "step": 700
164
- },
165
- {
166
- "epoch": 0.06586747464102226,
167
- "grad_norm": 0.39421921968460083,
168
- "learning_rate": 4.9176620411031094e-05,
169
- "loss": 0.1219,
170
- "step": 750
171
- },
172
- {
173
- "epoch": 0.07025863961709042,
174
- "grad_norm": 0.4776351749897003,
175
- "learning_rate": 4.912172843843317e-05,
176
- "loss": 0.1295,
177
- "step": 800
178
- },
179
- {
180
- "epoch": 0.07025863961709042,
181
- "eval_loss": 0.11582696437835693,
182
- "eval_runtime": 82.3981,
183
- "eval_samples_per_second": 54.127,
184
- "eval_steps_per_second": 13.532,
185
- "step": 800
186
- },
187
- {
188
- "epoch": 0.07464980459315856,
189
- "grad_norm": 0.5199089646339417,
190
- "learning_rate": 4.906683646583524e-05,
191
- "loss": 0.1185,
192
- "step": 850
193
- },
194
- {
195
- "epoch": 0.07904096956922671,
196
- "grad_norm": 0.44966959953308105,
197
- "learning_rate": 4.901194449323731e-05,
198
- "loss": 0.1132,
199
- "step": 900
200
- },
201
- {
202
- "epoch": 0.07904096956922671,
203
- "eval_loss": 0.11530017107725143,
204
- "eval_runtime": 82.5598,
205
- "eval_samples_per_second": 54.021,
206
- "eval_steps_per_second": 13.505,
207
- "step": 900
208
- },
209
- {
210
- "epoch": 0.08343213454529487,
211
- "grad_norm": 0.3631457984447479,
212
- "learning_rate": 4.895705252063939e-05,
213
- "loss": 0.1204,
214
- "step": 950
215
- },
216
- {
217
- "epoch": 0.08782329952136302,
218
- "grad_norm": 0.7695568799972534,
219
- "learning_rate": 4.890216054804146e-05,
220
- "loss": 0.1128,
221
- "step": 1000
222
- },
223
- {
224
- "epoch": 0.08782329952136302,
225
- "eval_loss": 0.11499012261629105,
226
- "eval_runtime": 82.4716,
227
- "eval_samples_per_second": 54.079,
228
- "eval_steps_per_second": 13.52,
229
- "step": 1000
230
- },
231
- {
232
- "epoch": 0.09221446449743116,
233
- "grad_norm": 0.3713476359844208,
234
- "learning_rate": 4.884726857544353e-05,
235
- "loss": 0.1251,
236
- "step": 1050
237
- },
238
- {
239
- "epoch": 0.09660562947349932,
240
- "grad_norm": 0.37777256965637207,
241
- "learning_rate": 4.8792376602845605e-05,
242
- "loss": 0.1153,
243
- "step": 1100
244
- },
245
- {
246
- "epoch": 0.09660562947349932,
247
- "eval_loss": 0.11525405198335648,
248
- "eval_runtime": 85.5526,
249
- "eval_samples_per_second": 52.132,
250
- "eval_steps_per_second": 13.033,
251
- "step": 1100
252
- },
253
- {
254
- "epoch": 0.10099679444956747,
255
- "grad_norm": 0.48114562034606934,
256
- "learning_rate": 4.873748463024767e-05,
257
- "loss": 0.1242,
258
- "step": 1150
259
- },
260
- {
261
- "epoch": 0.10538795942563563,
262
- "grad_norm": 0.7776908278465271,
263
- "learning_rate": 4.868259265764974e-05,
264
- "loss": 0.1237,
265
- "step": 1200
266
- },
267
- {
268
- "epoch": 0.10538795942563563,
269
- "eval_loss": 0.11445864289999008,
270
- "eval_runtime": 82.4683,
271
- "eval_samples_per_second": 54.081,
272
- "eval_steps_per_second": 13.52,
273
- "step": 1200
274
- },
275
- {
276
- "epoch": 0.10977912440170377,
277
- "grad_norm": 0.34126266837120056,
278
- "learning_rate": 4.8627700685051817e-05,
279
- "loss": 0.1331,
280
- "step": 1250
281
- },
282
- {
283
- "epoch": 0.11417028937777192,
284
- "grad_norm": 0.32315969467163086,
285
- "learning_rate": 4.857280871245389e-05,
286
- "loss": 0.1167,
287
- "step": 1300
288
- },
289
- {
290
- "epoch": 0.11417028937777192,
291
- "eval_loss": 0.11452117562294006,
292
- "eval_runtime": 82.4936,
293
- "eval_samples_per_second": 54.065,
294
- "eval_steps_per_second": 13.516,
295
- "step": 1300
296
- },
297
- {
298
- "epoch": 0.11856145435384008,
299
- "grad_norm": 0.7266770005226135,
300
- "learning_rate": 4.8517916739855967e-05,
301
- "loss": 0.1183,
302
- "step": 1350
303
- },
304
- {
305
- "epoch": 0.12295261932990822,
306
- "grad_norm": 0.4979361295700073,
307
- "learning_rate": 4.8463024767258035e-05,
308
- "loss": 0.126,
309
- "step": 1400
310
- },
311
- {
312
- "epoch": 0.12295261932990822,
313
- "eval_loss": 0.11409644037485123,
314
- "eval_runtime": 82.4146,
315
- "eval_samples_per_second": 54.117,
316
- "eval_steps_per_second": 13.529,
317
- "step": 1400
318
- },
319
- {
320
- "epoch": 0.12734378430597637,
321
- "grad_norm": 0.45280951261520386,
322
- "learning_rate": 4.840813279466011e-05,
323
- "loss": 0.1152,
324
- "step": 1450
325
- },
326
- {
327
- "epoch": 0.13173494928204452,
328
- "grad_norm": 0.9963550567626953,
329
- "learning_rate": 4.8353240822062185e-05,
330
- "loss": 0.1214,
331
- "step": 1500
332
- },
333
- {
334
- "epoch": 0.13173494928204452,
335
- "eval_loss": 0.11325465887784958,
336
- "eval_runtime": 82.4565,
337
- "eval_samples_per_second": 54.089,
338
- "eval_steps_per_second": 13.522,
339
- "step": 1500
340
  }
341
  ],
342
  "logging_steps": 50,
343
- "max_steps": 45544,
344
  "num_input_tokens_seen": 0,
345
  "num_train_epochs": 4,
346
  "save_steps": 500,
@@ -356,8 +136,8 @@
356
  "attributes": {}
357
  }
358
  },
359
- "total_flos": 3653747343360000.0,
360
- "train_batch_size": 4,
361
  "trial_name": null,
362
  "trial_params": null
363
  }
 
1
  {
2
+ "best_metric": 0.21807625889778137,
3
+ "best_model_checkpoint": "./fine-tuned/checkpoint-500",
4
+ "epoch": 0.3512469265893923,
5
  "eval_steps": 100,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.035124692658939236,
13
+ "grad_norm": 31298.91015625,
14
+ "learning_rate": 2.9736472241742796e-05,
15
+ "loss": 0.2772,
16
  "step": 50
17
  },
18
  {
19
+ "epoch": 0.07024938531787847,
20
+ "grad_norm": 28423.171875,
21
+ "learning_rate": 2.9472944483485594e-05,
22
+ "loss": 0.2575,
23
  "step": 100
24
  },
25
  {
26
+ "epoch": 0.07024938531787847,
27
+ "eval_loss": 0.22961987555027008,
28
+ "eval_runtime": 71.5175,
29
+ "eval_samples_per_second": 62.362,
30
+ "eval_steps_per_second": 1.958,
31
  "step": 100
32
  },
33
  {
34
+ "epoch": 0.1053740779768177,
35
+ "grad_norm": 28882.9609375,
36
+ "learning_rate": 2.9209416725228392e-05,
37
+ "loss": 0.24,
38
  "step": 150
39
  },
40
  {
41
+ "epoch": 0.14049877063575694,
42
+ "grad_norm": 44492.234375,
43
+ "learning_rate": 2.894588896697119e-05,
44
+ "loss": 0.2427,
45
  "step": 200
46
  },
47
  {
48
+ "epoch": 0.14049877063575694,
49
+ "eval_loss": 0.22477279603481293,
50
+ "eval_runtime": 71.1758,
51
+ "eval_samples_per_second": 62.662,
52
+ "eval_steps_per_second": 1.967,
53
  "step": 200
54
  },
55
  {
56
+ "epoch": 0.17562346329469616,
57
+ "grad_norm": 23385.271484375,
58
+ "learning_rate": 2.8682361208713985e-05,
59
+ "loss": 0.237,
60
  "step": 250
61
  },
62
  {
63
+ "epoch": 0.2107481559536354,
64
+ "grad_norm": 65184.7578125,
65
+ "learning_rate": 2.841883345045678e-05,
66
+ "loss": 0.2351,
67
  "step": 300
68
  },
69
  {
70
+ "epoch": 0.2107481559536354,
71
+ "eval_loss": 0.22264569997787476,
72
+ "eval_runtime": 71.2392,
73
+ "eval_samples_per_second": 62.606,
74
+ "eval_steps_per_second": 1.965,
75
  "step": 300
76
  },
77
  {
78
+ "epoch": 0.24587284861257463,
79
+ "grad_norm": 26510.09375,
80
+ "learning_rate": 2.8155305692199578e-05,
81
+ "loss": 0.2387,
82
  "step": 350
83
  },
84
  {
85
+ "epoch": 0.2809975412715139,
86
+ "grad_norm": 35873.625,
87
+ "learning_rate": 2.7891777933942376e-05,
88
+ "loss": 0.239,
89
  "step": 400
90
  },
91
  {
92
+ "epoch": 0.2809975412715139,
93
+ "eval_loss": 0.22040367126464844,
94
+ "eval_runtime": 71.1652,
95
+ "eval_samples_per_second": 62.671,
96
+ "eval_steps_per_second": 1.967,
97
  "step": 400
98
  },
99
  {
100
+ "epoch": 0.31612223393045313,
101
+ "grad_norm": 190454.703125,
102
+ "learning_rate": 2.7628250175685175e-05,
103
+ "loss": 0.2343,
104
  "step": 450
105
  },
106
  {
107
+ "epoch": 0.3512469265893923,
108
+ "grad_norm": 27248.146484375,
109
+ "learning_rate": 2.736472241742797e-05,
110
+ "loss": 0.2349,
111
  "step": 500
112
  },
113
  {
114
+ "epoch": 0.3512469265893923,
115
+ "eval_loss": 0.21807625889778137,
116
+ "eval_runtime": 71.5449,
117
+ "eval_samples_per_second": 62.338,
118
+ "eval_steps_per_second": 1.957,
119
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  }
121
  ],
122
  "logging_steps": 50,
123
+ "max_steps": 5692,
124
  "num_input_tokens_seen": 0,
125
  "num_train_epochs": 4,
126
  "save_steps": 500,
 
136
  "attributes": {}
137
  }
138
  },
139
+ "total_flos": 4871663124480000.0,
140
+ "train_batch_size": 32,
141
  "trial_name": null,
142
  "trial_params": null
143
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a9eac9b6dfa8d64bfb6721f59fcdb734e155bcd97bd566040ad0e9d54879ad8
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf0bce22946039d2fc9fa9044d9964dcf0976913488528c8ab2c151752735b36
3
  size 5432