Upload folder using huggingface_hub

#1
config.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "$",
13
+ "1": "&",
14
+ "10": "8",
15
+ "11": "9",
16
+ "12": "@",
17
+ "13": "A",
18
+ "14": "B",
19
+ "15": "C",
20
+ "16": "D",
21
+ "17": "E",
22
+ "18": "F",
23
+ "19": "G",
24
+ "2": "0",
25
+ "20": "H",
26
+ "21": "I",
27
+ "22": "J",
28
+ "23": "K",
29
+ "24": "L",
30
+ "25": "M",
31
+ "26": "N",
32
+ "27": "P",
33
+ "28": "Q",
34
+ "29": "R",
35
+ "3": "1",
36
+ "30": "S",
37
+ "31": "T",
38
+ "32": "U",
39
+ "33": "V",
40
+ "34": "W",
41
+ "35": "X",
42
+ "36": "Y",
43
+ "37": "Z",
44
+ "4": "2",
45
+ "5": "3",
46
+ "6": "4",
47
+ "7": "5",
48
+ "8": "6",
49
+ "9": "7"
50
+ },
51
+ "image_size": 224,
52
+ "initializer_range": 0.02,
53
+ "intermediate_size": 3072,
54
+ "label2id": {
55
+ "$": "0",
56
+ "&": "1",
57
+ "0": "2",
58
+ "1": "3",
59
+ "2": "4",
60
+ "3": "5",
61
+ "4": "6",
62
+ "5": "7",
63
+ "6": "8",
64
+ "7": "9",
65
+ "8": "10",
66
+ "9": "11",
67
+ "@": "12",
68
+ "A": "13",
69
+ "B": "14",
70
+ "C": "15",
71
+ "D": "16",
72
+ "E": "17",
73
+ "F": "18",
74
+ "G": "19",
75
+ "H": "20",
76
+ "I": "21",
77
+ "J": "22",
78
+ "K": "23",
79
+ "L": "24",
80
+ "M": "25",
81
+ "N": "26",
82
+ "P": "27",
83
+ "Q": "28",
84
+ "R": "29",
85
+ "S": "30",
86
+ "T": "31",
87
+ "U": "32",
88
+ "V": "33",
89
+ "W": "34",
90
+ "X": "35",
91
+ "Y": "36",
92
+ "Z": "37"
93
+ },
94
+ "layer_norm_eps": 1e-12,
95
+ "model_type": "vit",
96
+ "num_attention_heads": 12,
97
+ "num_channels": 3,
98
+ "num_hidden_layers": 12,
99
+ "patch_size": 16,
100
+ "problem_type": "single_label_classification",
101
+ "qkv_bias": true,
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.31.0"
104
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:988da3d053e5377b39bb3d8ff5a44e37cb92cf461a3f9c8d6926266973875b87
3
+ size 686740357
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTFeatureExtractor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f245fdf60dcda05f6d43b8d978c2a3444607fd13a6776a617a62f6919ab1afc7
3
+ size 343379437
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:040e1fc422c0a8be218aff6765f3632a90cc5bfe30d3447acf55f588b1fbf767
3
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9459d5dc5e462d70441715c28d43c4681f3df8583f6a53745a7dc38d7f20ad4
3
+ size 627
trainer_state.json ADDED
@@ -0,0 +1,1105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.2977927625179291,
3
+ "best_model_checkpoint": "./vit-base-beans/checkpoint-1280",
4
+ "epoch": 3.963963963963964,
5
+ "global_step": 1320,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.03,
12
+ "learning_rate": 0.0001984984984984985,
13
+ "loss": 3.6277,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 0.06,
18
+ "learning_rate": 0.00019699699699699701,
19
+ "loss": 3.5588,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 0.09,
24
+ "learning_rate": 0.0001954954954954955,
25
+ "loss": 3.4115,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 0.12,
30
+ "learning_rate": 0.00019399399399399402,
31
+ "loss": 3.2498,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.12,
36
+ "eval_accuracy": 0.22024983563445102,
37
+ "eval_loss": 3.1079554557800293,
38
+ "eval_runtime": 25.4224,
39
+ "eval_samples_per_second": 59.829,
40
+ "eval_steps_per_second": 7.513,
41
+ "step": 40
42
+ },
43
+ {
44
+ "epoch": 0.15,
45
+ "learning_rate": 0.0001924924924924925,
46
+ "loss": 2.967,
47
+ "step": 50
48
+ },
49
+ {
50
+ "epoch": 0.18,
51
+ "learning_rate": 0.000190990990990991,
52
+ "loss": 2.8969,
53
+ "step": 60
54
+ },
55
+ {
56
+ "epoch": 0.21,
57
+ "learning_rate": 0.0001894894894894895,
58
+ "loss": 2.7153,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.24,
63
+ "learning_rate": 0.000187987987987988,
64
+ "loss": 2.5076,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.24,
69
+ "eval_accuracy": 0.4339250493096647,
70
+ "eval_loss": 2.4336094856262207,
71
+ "eval_runtime": 25.9183,
72
+ "eval_samples_per_second": 58.684,
73
+ "eval_steps_per_second": 7.369,
74
+ "step": 80
75
+ },
76
+ {
77
+ "epoch": 0.27,
78
+ "learning_rate": 0.0001864864864864865,
79
+ "loss": 2.3504,
80
+ "step": 90
81
+ },
82
+ {
83
+ "epoch": 0.3,
84
+ "learning_rate": 0.000184984984984985,
85
+ "loss": 2.2198,
86
+ "step": 100
87
+ },
88
+ {
89
+ "epoch": 0.33,
90
+ "learning_rate": 0.0001834834834834835,
91
+ "loss": 2.1227,
92
+ "step": 110
93
+ },
94
+ {
95
+ "epoch": 0.36,
96
+ "learning_rate": 0.000181981981981982,
97
+ "loss": 1.9345,
98
+ "step": 120
99
+ },
100
+ {
101
+ "epoch": 0.36,
102
+ "eval_accuracy": 0.6226166995397765,
103
+ "eval_loss": 1.8898930549621582,
104
+ "eval_runtime": 25.7004,
105
+ "eval_samples_per_second": 59.182,
106
+ "eval_steps_per_second": 7.432,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 0.39,
111
+ "learning_rate": 0.0001804804804804805,
112
+ "loss": 1.8725,
113
+ "step": 130
114
+ },
115
+ {
116
+ "epoch": 0.42,
117
+ "learning_rate": 0.00017897897897897898,
118
+ "loss": 1.6834,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 0.45,
123
+ "learning_rate": 0.0001774774774774775,
124
+ "loss": 1.6183,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 0.48,
129
+ "learning_rate": 0.000175975975975976,
130
+ "loss": 1.4224,
131
+ "step": 160
132
+ },
133
+ {
134
+ "epoch": 0.48,
135
+ "eval_accuracy": 0.673241288625904,
136
+ "eval_loss": 1.5380345582962036,
137
+ "eval_runtime": 26.3631,
138
+ "eval_samples_per_second": 57.694,
139
+ "eval_steps_per_second": 7.245,
140
+ "step": 160
141
+ },
142
+ {
143
+ "epoch": 0.51,
144
+ "learning_rate": 0.0001744744744744745,
145
+ "loss": 1.5553,
146
+ "step": 170
147
+ },
148
+ {
149
+ "epoch": 0.54,
150
+ "learning_rate": 0.000172972972972973,
151
+ "loss": 1.3173,
152
+ "step": 180
153
+ },
154
+ {
155
+ "epoch": 0.57,
156
+ "learning_rate": 0.00017147147147147148,
157
+ "loss": 1.2917,
158
+ "step": 190
159
+ },
160
+ {
161
+ "epoch": 0.6,
162
+ "learning_rate": 0.00016996996996997,
163
+ "loss": 1.2626,
164
+ "step": 200
165
+ },
166
+ {
167
+ "epoch": 0.6,
168
+ "eval_accuracy": 0.7370151216305062,
169
+ "eval_loss": 1.2467007637023926,
170
+ "eval_runtime": 25.735,
171
+ "eval_samples_per_second": 59.102,
172
+ "eval_steps_per_second": 7.422,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 0.63,
177
+ "learning_rate": 0.00016846846846846846,
178
+ "loss": 1.0941,
179
+ "step": 210
180
+ },
181
+ {
182
+ "epoch": 0.66,
183
+ "learning_rate": 0.00016696696696696697,
184
+ "loss": 1.1499,
185
+ "step": 220
186
+ },
187
+ {
188
+ "epoch": 0.69,
189
+ "learning_rate": 0.00016546546546546546,
190
+ "loss": 1.085,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 0.72,
195
+ "learning_rate": 0.00016396396396396395,
196
+ "loss": 1.0447,
197
+ "step": 240
198
+ },
199
+ {
200
+ "epoch": 0.72,
201
+ "eval_accuracy": 0.7633136094674556,
202
+ "eval_loss": 1.0867702960968018,
203
+ "eval_runtime": 25.9624,
204
+ "eval_samples_per_second": 58.585,
205
+ "eval_steps_per_second": 7.357,
206
+ "step": 240
207
+ },
208
+ {
209
+ "epoch": 0.75,
210
+ "learning_rate": 0.00016246246246246247,
211
+ "loss": 0.9169,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 0.78,
216
+ "learning_rate": 0.00016096096096096096,
217
+ "loss": 1.0707,
218
+ "step": 260
219
+ },
220
+ {
221
+ "epoch": 0.81,
222
+ "learning_rate": 0.00015945945945945947,
223
+ "loss": 0.8938,
224
+ "step": 270
225
+ },
226
+ {
227
+ "epoch": 0.84,
228
+ "learning_rate": 0.00015795795795795796,
229
+ "loss": 0.9403,
230
+ "step": 280
231
+ },
232
+ {
233
+ "epoch": 0.84,
234
+ "eval_accuracy": 0.8007889546351085,
235
+ "eval_loss": 0.8639808297157288,
236
+ "eval_runtime": 25.817,
237
+ "eval_samples_per_second": 58.915,
238
+ "eval_steps_per_second": 7.398,
239
+ "step": 280
240
+ },
241
+ {
242
+ "epoch": 0.87,
243
+ "learning_rate": 0.00015645645645645645,
244
+ "loss": 0.8592,
245
+ "step": 290
246
+ },
247
+ {
248
+ "epoch": 0.9,
249
+ "learning_rate": 0.00015495495495495496,
250
+ "loss": 0.9383,
251
+ "step": 300
252
+ },
253
+ {
254
+ "epoch": 0.93,
255
+ "learning_rate": 0.00015345345345345345,
256
+ "loss": 0.8804,
257
+ "step": 310
258
+ },
259
+ {
260
+ "epoch": 0.96,
261
+ "learning_rate": 0.00015195195195195194,
262
+ "loss": 0.7259,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 0.96,
267
+ "eval_accuracy": 0.8198553583168968,
268
+ "eval_loss": 0.7540761828422546,
269
+ "eval_runtime": 25.7,
270
+ "eval_samples_per_second": 59.183,
271
+ "eval_steps_per_second": 7.432,
272
+ "step": 320
273
+ },
274
+ {
275
+ "epoch": 0.99,
276
+ "learning_rate": 0.00015045045045045046,
277
+ "loss": 0.7308,
278
+ "step": 330
279
+ },
280
+ {
281
+ "epoch": 1.02,
282
+ "learning_rate": 0.00014894894894894895,
283
+ "loss": 0.6628,
284
+ "step": 340
285
+ },
286
+ {
287
+ "epoch": 1.05,
288
+ "learning_rate": 0.00014744744744744746,
289
+ "loss": 0.6611,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 1.08,
294
+ "learning_rate": 0.00014594594594594595,
295
+ "loss": 0.7276,
296
+ "step": 360
297
+ },
298
+ {
299
+ "epoch": 1.08,
300
+ "eval_accuracy": 0.8500986193293886,
301
+ "eval_loss": 0.6682031154632568,
302
+ "eval_runtime": 26.3636,
303
+ "eval_samples_per_second": 57.693,
304
+ "eval_steps_per_second": 7.245,
305
+ "step": 360
306
+ },
307
+ {
308
+ "epoch": 1.11,
309
+ "learning_rate": 0.00014444444444444444,
310
+ "loss": 0.6113,
311
+ "step": 370
312
+ },
313
+ {
314
+ "epoch": 1.14,
315
+ "learning_rate": 0.00014294294294294295,
316
+ "loss": 0.545,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 1.17,
321
+ "learning_rate": 0.00014144144144144144,
322
+ "loss": 0.682,
323
+ "step": 390
324
+ },
325
+ {
326
+ "epoch": 1.2,
327
+ "learning_rate": 0.00013993993993993996,
328
+ "loss": 0.5643,
329
+ "step": 400
330
+ },
331
+ {
332
+ "epoch": 1.2,
333
+ "eval_accuracy": 0.8303747534516766,
334
+ "eval_loss": 0.6452277302742004,
335
+ "eval_runtime": 25.857,
336
+ "eval_samples_per_second": 58.824,
337
+ "eval_steps_per_second": 7.387,
338
+ "step": 400
339
+ },
340
+ {
341
+ "epoch": 1.23,
342
+ "learning_rate": 0.00013843843843843845,
343
+ "loss": 0.5002,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 1.26,
348
+ "learning_rate": 0.00013693693693693693,
349
+ "loss": 0.5067,
350
+ "step": 420
351
+ },
352
+ {
353
+ "epoch": 1.29,
354
+ "learning_rate": 0.00013543543543543545,
355
+ "loss": 0.5445,
356
+ "step": 430
357
+ },
358
+ {
359
+ "epoch": 1.32,
360
+ "learning_rate": 0.00013393393393393394,
361
+ "loss": 0.6703,
362
+ "step": 440
363
+ },
364
+ {
365
+ "epoch": 1.32,
366
+ "eval_accuracy": 0.7988165680473372,
367
+ "eval_loss": 0.6957046985626221,
368
+ "eval_runtime": 25.2844,
369
+ "eval_samples_per_second": 60.156,
370
+ "eval_steps_per_second": 7.554,
371
+ "step": 440
372
+ },
373
+ {
374
+ "epoch": 1.35,
375
+ "learning_rate": 0.00013243243243243243,
376
+ "loss": 0.8127,
377
+ "step": 450
378
+ },
379
+ {
380
+ "epoch": 1.38,
381
+ "learning_rate": 0.00013093093093093094,
382
+ "loss": 0.567,
383
+ "step": 460
384
+ },
385
+ {
386
+ "epoch": 1.41,
387
+ "learning_rate": 0.00012942942942942943,
388
+ "loss": 0.5575,
389
+ "step": 470
390
+ },
391
+ {
392
+ "epoch": 1.44,
393
+ "learning_rate": 0.00012792792792792795,
394
+ "loss": 0.396,
395
+ "step": 480
396
+ },
397
+ {
398
+ "epoch": 1.44,
399
+ "eval_accuracy": 0.8428665351742275,
400
+ "eval_loss": 0.5446497797966003,
401
+ "eval_runtime": 25.6496,
402
+ "eval_samples_per_second": 59.299,
403
+ "eval_steps_per_second": 7.447,
404
+ "step": 480
405
+ },
406
+ {
407
+ "epoch": 1.47,
408
+ "learning_rate": 0.00012642642642642644,
409
+ "loss": 0.4975,
410
+ "step": 490
411
+ },
412
+ {
413
+ "epoch": 1.5,
414
+ "learning_rate": 0.00012492492492492492,
415
+ "loss": 0.3971,
416
+ "step": 500
417
+ },
418
+ {
419
+ "epoch": 1.53,
420
+ "learning_rate": 0.00012342342342342344,
421
+ "loss": 0.469,
422
+ "step": 510
423
+ },
424
+ {
425
+ "epoch": 1.56,
426
+ "learning_rate": 0.00012192192192192193,
427
+ "loss": 0.4277,
428
+ "step": 520
429
+ },
430
+ {
431
+ "epoch": 1.56,
432
+ "eval_accuracy": 0.8408941485864563,
433
+ "eval_loss": 0.5659409165382385,
434
+ "eval_runtime": 25.2836,
435
+ "eval_samples_per_second": 60.157,
436
+ "eval_steps_per_second": 7.554,
437
+ "step": 520
438
+ },
439
+ {
440
+ "epoch": 1.59,
441
+ "learning_rate": 0.00012042042042042043,
442
+ "loss": 0.4984,
443
+ "step": 530
444
+ },
445
+ {
446
+ "epoch": 1.62,
447
+ "learning_rate": 0.00011891891891891893,
448
+ "loss": 0.476,
449
+ "step": 540
450
+ },
451
+ {
452
+ "epoch": 1.65,
453
+ "learning_rate": 0.00011741741741741743,
454
+ "loss": 0.5402,
455
+ "step": 550
456
+ },
457
+ {
458
+ "epoch": 1.68,
459
+ "learning_rate": 0.00011591591591591592,
460
+ "loss": 0.457,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 1.68,
465
+ "eval_accuracy": 0.8527284681130834,
466
+ "eval_loss": 0.5437894463539124,
467
+ "eval_runtime": 25.9766,
468
+ "eval_samples_per_second": 58.553,
469
+ "eval_steps_per_second": 7.353,
470
+ "step": 560
471
+ },
472
+ {
473
+ "epoch": 1.71,
474
+ "learning_rate": 0.00011441441441441443,
475
+ "loss": 0.3039,
476
+ "step": 570
477
+ },
478
+ {
479
+ "epoch": 1.74,
480
+ "learning_rate": 0.00011291291291291293,
481
+ "loss": 0.4621,
482
+ "step": 580
483
+ },
484
+ {
485
+ "epoch": 1.77,
486
+ "learning_rate": 0.00011141141141141143,
487
+ "loss": 0.4584,
488
+ "step": 590
489
+ },
490
+ {
491
+ "epoch": 1.8,
492
+ "learning_rate": 0.00010990990990990993,
493
+ "loss": 0.5632,
494
+ "step": 600
495
+ },
496
+ {
497
+ "epoch": 1.8,
498
+ "eval_accuracy": 0.8547008547008547,
499
+ "eval_loss": 0.49320539832115173,
500
+ "eval_runtime": 25.6823,
501
+ "eval_samples_per_second": 59.224,
502
+ "eval_steps_per_second": 7.437,
503
+ "step": 600
504
+ },
505
+ {
506
+ "epoch": 1.83,
507
+ "learning_rate": 0.00010840840840840842,
508
+ "loss": 0.4798,
509
+ "step": 610
510
+ },
511
+ {
512
+ "epoch": 1.86,
513
+ "learning_rate": 0.00010690690690690692,
514
+ "loss": 0.3991,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 1.89,
519
+ "learning_rate": 0.0001054054054054054,
520
+ "loss": 0.4815,
521
+ "step": 630
522
+ },
523
+ {
524
+ "epoch": 1.92,
525
+ "learning_rate": 0.0001039039039039039,
526
+ "loss": 0.4066,
527
+ "step": 640
528
+ },
529
+ {
530
+ "epoch": 1.92,
531
+ "eval_accuracy": 0.8658777120315582,
532
+ "eval_loss": 0.4579479694366455,
533
+ "eval_runtime": 25.9931,
534
+ "eval_samples_per_second": 58.516,
535
+ "eval_steps_per_second": 7.348,
536
+ "step": 640
537
+ },
538
+ {
539
+ "epoch": 1.95,
540
+ "learning_rate": 0.0001024024024024024,
541
+ "loss": 0.5159,
542
+ "step": 650
543
+ },
544
+ {
545
+ "epoch": 1.98,
546
+ "learning_rate": 0.00010090090090090089,
547
+ "loss": 0.4886,
548
+ "step": 660
549
+ },
550
+ {
551
+ "epoch": 2.01,
552
+ "learning_rate": 9.93993993993994e-05,
553
+ "loss": 0.2916,
554
+ "step": 670
555
+ },
556
+ {
557
+ "epoch": 2.04,
558
+ "learning_rate": 9.789789789789791e-05,
559
+ "loss": 0.2505,
560
+ "step": 680
561
+ },
562
+ {
563
+ "epoch": 2.04,
564
+ "eval_accuracy": 0.8665351742274819,
565
+ "eval_loss": 0.4546054005622864,
566
+ "eval_runtime": 26.0755,
567
+ "eval_samples_per_second": 58.331,
568
+ "eval_steps_per_second": 7.325,
569
+ "step": 680
570
+ },
571
+ {
572
+ "epoch": 2.07,
573
+ "learning_rate": 9.639639639639641e-05,
574
+ "loss": 0.3156,
575
+ "step": 690
576
+ },
577
+ {
578
+ "epoch": 2.1,
579
+ "learning_rate": 9.48948948948949e-05,
580
+ "loss": 0.375,
581
+ "step": 700
582
+ },
583
+ {
584
+ "epoch": 2.13,
585
+ "learning_rate": 9.33933933933934e-05,
586
+ "loss": 0.2663,
587
+ "step": 710
588
+ },
589
+ {
590
+ "epoch": 2.16,
591
+ "learning_rate": 9.18918918918919e-05,
592
+ "loss": 0.3181,
593
+ "step": 720
594
+ },
595
+ {
596
+ "epoch": 2.16,
597
+ "eval_accuracy": 0.8586456278763971,
598
+ "eval_loss": 0.4709266424179077,
599
+ "eval_runtime": 26.2379,
600
+ "eval_samples_per_second": 57.97,
601
+ "eval_steps_per_second": 7.28,
602
+ "step": 720
603
+ },
604
+ {
605
+ "epoch": 2.19,
606
+ "learning_rate": 9.039039039039039e-05,
607
+ "loss": 0.2628,
608
+ "step": 730
609
+ },
610
+ {
611
+ "epoch": 2.22,
612
+ "learning_rate": 8.888888888888889e-05,
613
+ "loss": 0.1784,
614
+ "step": 740
615
+ },
616
+ {
617
+ "epoch": 2.25,
618
+ "learning_rate": 8.738738738738738e-05,
619
+ "loss": 0.357,
620
+ "step": 750
621
+ },
622
+ {
623
+ "epoch": 2.28,
624
+ "learning_rate": 8.588588588588588e-05,
625
+ "loss": 0.1931,
626
+ "step": 760
627
+ },
628
+ {
629
+ "epoch": 2.28,
630
+ "eval_accuracy": 0.8593030900723209,
631
+ "eval_loss": 0.45762282609939575,
632
+ "eval_runtime": 25.7501,
633
+ "eval_samples_per_second": 59.068,
634
+ "eval_steps_per_second": 7.417,
635
+ "step": 760
636
+ },
637
+ {
638
+ "epoch": 2.31,
639
+ "learning_rate": 8.438438438438439e-05,
640
+ "loss": 0.2865,
641
+ "step": 770
642
+ },
643
+ {
644
+ "epoch": 2.34,
645
+ "learning_rate": 8.288288288288289e-05,
646
+ "loss": 0.3125,
647
+ "step": 780
648
+ },
649
+ {
650
+ "epoch": 2.37,
651
+ "learning_rate": 8.138138138138138e-05,
652
+ "loss": 0.417,
653
+ "step": 790
654
+ },
655
+ {
656
+ "epoch": 2.4,
657
+ "learning_rate": 7.987987987987988e-05,
658
+ "loss": 0.288,
659
+ "step": 800
660
+ },
661
+ {
662
+ "epoch": 2.4,
663
+ "eval_accuracy": 0.8763971071663379,
664
+ "eval_loss": 0.41621989011764526,
665
+ "eval_runtime": 25.9323,
666
+ "eval_samples_per_second": 58.653,
667
+ "eval_steps_per_second": 7.365,
668
+ "step": 800
669
+ },
670
+ {
671
+ "epoch": 2.43,
672
+ "learning_rate": 7.837837837837838e-05,
673
+ "loss": 0.3603,
674
+ "step": 810
675
+ },
676
+ {
677
+ "epoch": 2.46,
678
+ "learning_rate": 7.687687687687688e-05,
679
+ "loss": 0.2386,
680
+ "step": 820
681
+ },
682
+ {
683
+ "epoch": 2.49,
684
+ "learning_rate": 7.537537537537538e-05,
685
+ "loss": 0.2035,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 2.52,
690
+ "learning_rate": 7.387387387387387e-05,
691
+ "loss": 0.2315,
692
+ "step": 840
693
+ },
694
+ {
695
+ "epoch": 2.52,
696
+ "eval_accuracy": 0.8895463510848126,
697
+ "eval_loss": 0.3602614402770996,
698
+ "eval_runtime": 26.3037,
699
+ "eval_samples_per_second": 57.825,
700
+ "eval_steps_per_second": 7.261,
701
+ "step": 840
702
+ },
703
+ {
704
+ "epoch": 2.55,
705
+ "learning_rate": 7.237237237237238e-05,
706
+ "loss": 0.239,
707
+ "step": 850
708
+ },
709
+ {
710
+ "epoch": 2.58,
711
+ "learning_rate": 7.087087087087088e-05,
712
+ "loss": 0.2514,
713
+ "step": 860
714
+ },
715
+ {
716
+ "epoch": 2.61,
717
+ "learning_rate": 6.936936936936938e-05,
718
+ "loss": 0.3601,
719
+ "step": 870
720
+ },
721
+ {
722
+ "epoch": 2.64,
723
+ "learning_rate": 6.786786786786787e-05,
724
+ "loss": 0.1812,
725
+ "step": 880
726
+ },
727
+ {
728
+ "epoch": 2.64,
729
+ "eval_accuracy": 0.8823142669296515,
730
+ "eval_loss": 0.40812087059020996,
731
+ "eval_runtime": 25.5461,
732
+ "eval_samples_per_second": 59.539,
733
+ "eval_steps_per_second": 7.477,
734
+ "step": 880
735
+ },
736
+ {
737
+ "epoch": 2.67,
738
+ "learning_rate": 6.636636636636637e-05,
739
+ "loss": 0.2621,
740
+ "step": 890
741
+ },
742
+ {
743
+ "epoch": 2.7,
744
+ "learning_rate": 6.486486486486487e-05,
745
+ "loss": 0.2284,
746
+ "step": 900
747
+ },
748
+ {
749
+ "epoch": 2.73,
750
+ "learning_rate": 6.336336336336337e-05,
751
+ "loss": 0.1624,
752
+ "step": 910
753
+ },
754
+ {
755
+ "epoch": 2.76,
756
+ "learning_rate": 6.186186186186186e-05,
757
+ "loss": 0.2661,
758
+ "step": 920
759
+ },
760
+ {
761
+ "epoch": 2.76,
762
+ "eval_accuracy": 0.8777120315581854,
763
+ "eval_loss": 0.3957211375236511,
764
+ "eval_runtime": 26.3972,
765
+ "eval_samples_per_second": 57.62,
766
+ "eval_steps_per_second": 7.236,
767
+ "step": 920
768
+ },
769
+ {
770
+ "epoch": 2.79,
771
+ "learning_rate": 6.0360360360360365e-05,
772
+ "loss": 0.2434,
773
+ "step": 930
774
+ },
775
+ {
776
+ "epoch": 2.82,
777
+ "learning_rate": 5.8858858858858854e-05,
778
+ "loss": 0.2894,
779
+ "step": 940
780
+ },
781
+ {
782
+ "epoch": 2.85,
783
+ "learning_rate": 5.7357357357357356e-05,
784
+ "loss": 0.1421,
785
+ "step": 950
786
+ },
787
+ {
788
+ "epoch": 2.88,
789
+ "learning_rate": 5.585585585585585e-05,
790
+ "loss": 0.2632,
791
+ "step": 960
792
+ },
793
+ {
794
+ "epoch": 2.88,
795
+ "eval_accuracy": 0.8915187376725838,
796
+ "eval_loss": 0.34952232241630554,
797
+ "eval_runtime": 25.2354,
798
+ "eval_samples_per_second": 60.272,
799
+ "eval_steps_per_second": 7.569,
800
+ "step": 960
801
+ },
802
+ {
803
+ "epoch": 2.91,
804
+ "learning_rate": 5.435435435435435e-05,
805
+ "loss": 0.2474,
806
+ "step": 970
807
+ },
808
+ {
809
+ "epoch": 2.94,
810
+ "learning_rate": 5.2852852852852855e-05,
811
+ "loss": 0.2458,
812
+ "step": 980
813
+ },
814
+ {
815
+ "epoch": 2.97,
816
+ "learning_rate": 5.135135135135135e-05,
817
+ "loss": 0.2121,
818
+ "step": 990
819
+ },
820
+ {
821
+ "epoch": 3.0,
822
+ "learning_rate": 4.984984984984985e-05,
823
+ "loss": 0.215,
824
+ "step": 1000
825
+ },
826
+ {
827
+ "epoch": 3.0,
828
+ "eval_accuracy": 0.8915187376725838,
829
+ "eval_loss": 0.3721879720687866,
830
+ "eval_runtime": 26.0936,
831
+ "eval_samples_per_second": 58.29,
832
+ "eval_steps_per_second": 7.32,
833
+ "step": 1000
834
+ },
835
+ {
836
+ "epoch": 3.03,
837
+ "learning_rate": 4.834834834834835e-05,
838
+ "loss": 0.1788,
839
+ "step": 1010
840
+ },
841
+ {
842
+ "epoch": 3.06,
843
+ "learning_rate": 4.684684684684685e-05,
844
+ "loss": 0.1671,
845
+ "step": 1020
846
+ },
847
+ {
848
+ "epoch": 3.09,
849
+ "learning_rate": 4.5345345345345345e-05,
850
+ "loss": 0.1663,
851
+ "step": 1030
852
+ },
853
+ {
854
+ "epoch": 3.12,
855
+ "learning_rate": 4.384384384384385e-05,
856
+ "loss": 0.1661,
857
+ "step": 1040
858
+ },
859
+ {
860
+ "epoch": 3.12,
861
+ "eval_accuracy": 0.8888888888888888,
862
+ "eval_loss": 0.3638674318790436,
863
+ "eval_runtime": 25.7736,
864
+ "eval_samples_per_second": 59.014,
865
+ "eval_steps_per_second": 7.411,
866
+ "step": 1040
867
+ },
868
+ {
869
+ "epoch": 3.15,
870
+ "learning_rate": 4.234234234234234e-05,
871
+ "loss": 0.1734,
872
+ "step": 1050
873
+ },
874
+ {
875
+ "epoch": 3.18,
876
+ "learning_rate": 4.0840840840840845e-05,
877
+ "loss": 0.1525,
878
+ "step": 1060
879
+ },
880
+ {
881
+ "epoch": 3.21,
882
+ "learning_rate": 3.933933933933934e-05,
883
+ "loss": 0.1397,
884
+ "step": 1070
885
+ },
886
+ {
887
+ "epoch": 3.24,
888
+ "learning_rate": 3.783783783783784e-05,
889
+ "loss": 0.1164,
890
+ "step": 1080
891
+ },
892
+ {
893
+ "epoch": 3.24,
894
+ "eval_accuracy": 0.9013806706114399,
895
+ "eval_loss": 0.32475772500038147,
896
+ "eval_runtime": 25.7546,
897
+ "eval_samples_per_second": 59.057,
898
+ "eval_steps_per_second": 7.416,
899
+ "step": 1080
900
+ },
901
+ {
902
+ "epoch": 3.27,
903
+ "learning_rate": 3.633633633633634e-05,
904
+ "loss": 0.176,
905
+ "step": 1090
906
+ },
907
+ {
908
+ "epoch": 3.3,
909
+ "learning_rate": 3.483483483483483e-05,
910
+ "loss": 0.1248,
911
+ "step": 1100
912
+ },
913
+ {
914
+ "epoch": 3.33,
915
+ "learning_rate": 3.3333333333333335e-05,
916
+ "loss": 0.1025,
917
+ "step": 1110
918
+ },
919
+ {
920
+ "epoch": 3.36,
921
+ "learning_rate": 3.183183183183183e-05,
922
+ "loss": 0.1745,
923
+ "step": 1120
924
+ },
925
+ {
926
+ "epoch": 3.36,
927
+ "eval_accuracy": 0.8948060486522025,
928
+ "eval_loss": 0.33917945623397827,
929
+ "eval_runtime": 26.0362,
930
+ "eval_samples_per_second": 58.419,
931
+ "eval_steps_per_second": 7.336,
932
+ "step": 1120
933
+ },
934
+ {
935
+ "epoch": 3.39,
936
+ "learning_rate": 3.0330330330330332e-05,
937
+ "loss": 0.2151,
938
+ "step": 1130
939
+ },
940
+ {
941
+ "epoch": 3.42,
942
+ "learning_rate": 2.882882882882883e-05,
943
+ "loss": 0.1135,
944
+ "step": 1140
945
+ },
946
+ {
947
+ "epoch": 3.45,
948
+ "learning_rate": 2.732732732732733e-05,
949
+ "loss": 0.1307,
950
+ "step": 1150
951
+ },
952
+ {
953
+ "epoch": 3.48,
954
+ "learning_rate": 2.582582582582583e-05,
955
+ "loss": 0.1347,
956
+ "step": 1160
957
+ },
958
+ {
959
+ "epoch": 3.48,
960
+ "eval_accuracy": 0.8980933596318211,
961
+ "eval_loss": 0.33266809582710266,
962
+ "eval_runtime": 25.5437,
963
+ "eval_samples_per_second": 59.545,
964
+ "eval_steps_per_second": 7.477,
965
+ "step": 1160
966
+ },
967
+ {
968
+ "epoch": 3.51,
969
+ "learning_rate": 2.4324324324324327e-05,
970
+ "loss": 0.1395,
971
+ "step": 1170
972
+ },
973
+ {
974
+ "epoch": 3.54,
975
+ "learning_rate": 2.2822822822822822e-05,
976
+ "loss": 0.1506,
977
+ "step": 1180
978
+ },
979
+ {
980
+ "epoch": 3.57,
981
+ "learning_rate": 2.132132132132132e-05,
982
+ "loss": 0.1833,
983
+ "step": 1190
984
+ },
985
+ {
986
+ "epoch": 3.6,
987
+ "learning_rate": 1.981981981981982e-05,
988
+ "loss": 0.1362,
989
+ "step": 1200
990
+ },
991
+ {
992
+ "epoch": 3.6,
993
+ "eval_accuracy": 0.9105851413543721,
994
+ "eval_loss": 0.3018592596054077,
995
+ "eval_runtime": 26.0978,
996
+ "eval_samples_per_second": 58.281,
997
+ "eval_steps_per_second": 7.319,
998
+ "step": 1200
999
+ },
1000
+ {
1001
+ "epoch": 3.63,
1002
+ "learning_rate": 1.831831831831832e-05,
1003
+ "loss": 0.1402,
1004
+ "step": 1210
1005
+ },
1006
+ {
1007
+ "epoch": 3.66,
1008
+ "learning_rate": 1.6816816816816817e-05,
1009
+ "loss": 0.142,
1010
+ "step": 1220
1011
+ },
1012
+ {
1013
+ "epoch": 3.69,
1014
+ "learning_rate": 1.5315315315315316e-05,
1015
+ "loss": 0.1422,
1016
+ "step": 1230
1017
+ },
1018
+ {
1019
+ "epoch": 3.72,
1020
+ "learning_rate": 1.3813813813813815e-05,
1021
+ "loss": 0.127,
1022
+ "step": 1240
1023
+ },
1024
+ {
1025
+ "epoch": 3.72,
1026
+ "eval_accuracy": 0.9072978303747534,
1027
+ "eval_loss": 0.3087291419506073,
1028
+ "eval_runtime": 25.0476,
1029
+ "eval_samples_per_second": 60.724,
1030
+ "eval_steps_per_second": 7.625,
1031
+ "step": 1240
1032
+ },
1033
+ {
1034
+ "epoch": 3.75,
1035
+ "learning_rate": 1.2312312312312313e-05,
1036
+ "loss": 0.1435,
1037
+ "step": 1250
1038
+ },
1039
+ {
1040
+ "epoch": 3.78,
1041
+ "learning_rate": 1.0810810810810812e-05,
1042
+ "loss": 0.1236,
1043
+ "step": 1260
1044
+ },
1045
+ {
1046
+ "epoch": 3.81,
1047
+ "learning_rate": 9.309309309309309e-06,
1048
+ "loss": 0.1167,
1049
+ "step": 1270
1050
+ },
1051
+ {
1052
+ "epoch": 3.84,
1053
+ "learning_rate": 7.807807807807808e-06,
1054
+ "loss": 0.1041,
1055
+ "step": 1280
1056
+ },
1057
+ {
1058
+ "epoch": 3.84,
1059
+ "eval_accuracy": 0.9099276791584484,
1060
+ "eval_loss": 0.2977927625179291,
1061
+ "eval_runtime": 26.1338,
1062
+ "eval_samples_per_second": 58.2,
1063
+ "eval_steps_per_second": 7.309,
1064
+ "step": 1280
1065
+ },
1066
+ {
1067
+ "epoch": 3.87,
1068
+ "learning_rate": 6.306306306306306e-06,
1069
+ "loss": 0.1486,
1070
+ "step": 1290
1071
+ },
1072
+ {
1073
+ "epoch": 3.9,
1074
+ "learning_rate": 4.804804804804805e-06,
1075
+ "loss": 0.1316,
1076
+ "step": 1300
1077
+ },
1078
+ {
1079
+ "epoch": 3.93,
1080
+ "learning_rate": 3.3033033033033035e-06,
1081
+ "loss": 0.1035,
1082
+ "step": 1310
1083
+ },
1084
+ {
1085
+ "epoch": 3.96,
1086
+ "learning_rate": 1.801801801801802e-06,
1087
+ "loss": 0.106,
1088
+ "step": 1320
1089
+ },
1090
+ {
1091
+ "epoch": 3.96,
1092
+ "eval_accuracy": 0.9112426035502958,
1093
+ "eval_loss": 0.2991219162940979,
1094
+ "eval_runtime": 25.2949,
1095
+ "eval_samples_per_second": 60.131,
1096
+ "eval_steps_per_second": 7.551,
1097
+ "step": 1320
1098
+ }
1099
+ ],
1100
+ "max_steps": 1332,
1101
+ "num_train_epochs": 4,
1102
+ "total_flos": 1.6343682786726543e+18,
1103
+ "trial_name": null,
1104
+ "trial_params": null
1105
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c889fe7ac0f743a0537f10a56930a278af37a45454b9b3c7caf790fe6c39851
3
+ size 3963