OmAlve committed on
Commit
26a9a86
·
verified ·
1 Parent(s): f7edb1e

fixed labeling bug

Browse files
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the pcuenq/oxford-pets dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.0046
22
- - Accuracy: 0.9989
23
 
24
  ## Model description
25
 
@@ -39,8 +39,8 @@ More information needed
39
 
40
  The following hyperparameters were used during training:
41
  - learning_rate: 0.0003
42
- - train_batch_size: 128
43
- - eval_batch_size: 16
44
  - seed: 42
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
@@ -51,13 +51,15 @@ The following hyperparameters were used during training:
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
54
- | 0.0887 | 1.72 | 100 | 0.0765 | 0.9774 |
55
- | 0.0045 | 3.45 | 200 | 0.0046 | 0.9989 |
 
 
56
 
57
 
58
  ### Framework versions
59
 
60
  - Transformers 4.39.2
61
- - Pytorch 2.1.2
62
  - Datasets 2.16.0
63
  - Tokenizers 0.15.2
 
18
 
19
  This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on the pcuenq/oxford-pets dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.0058
22
+ - Accuracy: 0.9988
23
 
24
  ## Model description
25
 
 
39
 
40
  The following hyperparameters were used during training:
41
  - learning_rate: 0.0003
42
+ - train_batch_size: 64
43
+ - eval_batch_size: 8
44
  - seed: 42
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
 
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
54
+ | 0.3713 | 0.86 | 100 | 0.2084 | 0.9307 |
55
+ | 0.1173 | 1.72 | 200 | 0.0774 | 0.9763 |
56
+ | 0.0612 | 2.59 | 300 | 0.0212 | 0.9947 |
57
+ | 0.007 | 3.45 | 400 | 0.0058 | 0.9988 |
58
 
59
 
60
  ### Framework versions
61
 
62
  - Transformers 4.39.2
63
+ - Pytorch 2.2.1+cu121
64
  - Datasets 2.16.0
65
  - Tokenizers 0.15.2
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 2.2913817801515827e+18,
4
- "train_loss": 0.202088082896682,
5
- "train_runtime": 854.7685,
6
- "train_samples_per_second": 34.582,
7
- "train_steps_per_second": 0.271
8
  }
 
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 2.2913817801515827e+18,
4
+ "train_loss": 0.19263494449491003,
5
+ "train_runtime": 813.2364,
6
+ "train_samples_per_second": 36.349,
7
+ "train_steps_per_second": 0.571
8
  }
config.json CHANGED
@@ -9,85 +9,85 @@
9
  "hidden_dropout_prob": 0.0,
10
  "hidden_size": 768,
11
  "id2label": {
12
- "0": "pug Dog",
13
- "1": "japanese chin Dog",
14
- "10": "Ragdoll Cat",
15
- "11": "scottish terrier Dog",
16
- "12": "shiba inu Dog",
17
- "13": "chihuahua Dog",
18
- "14": "samoyed Dog",
19
- "15": "Maine Coon Cat",
20
- "16": "newfoundland Dog",
21
- "17": "Abyssinian Cat",
22
- "18": "saint bernard Dog",
23
- "19": "Persian Cat",
24
- "2": "american pit bull terrier Dog",
25
- "20": "american bulldog Dog",
26
- "21": "boxer Dog",
27
- "22": "leonberger Dog",
28
- "23": "german shorthaired Dog",
29
- "24": "staffordshire bull terrier Dog",
30
- "25": "Birman Cat",
31
- "26": "english cocker spaniel Dog",
32
- "27": "english setter Dog",
33
- "28": "Siamese Cat",
34
- "29": "Sphynx Cat",
35
- "3": "beagle Dog",
36
- "30": "Bombay Cat",
37
- "31": "keeshond Dog",
38
- "32": "havanese Dog",
39
- "33": "Bengal Cat",
40
- "34": "great pyrenees Dog",
41
- "35": "Russian Blue Cat",
42
- "36": "basset hound Dog",
43
- "4": "miniature pinscher Dog",
44
- "5": "pomeranian Dog",
45
- "6": "yorkshire terrier Dog",
46
- "7": "Egyptian Mau Cat",
47
- "8": "British Shorthair Cat",
48
- "9": "wheaten terrier Dog"
49
  },
50
  "image_size": 224,
51
  "initializer_range": 0.02,
52
  "intermediate_size": 3072,
53
  "label2id": {
54
- "Abyssinian Cat": "17",
55
- "Bengal Cat": "33",
56
- "Birman Cat": "25",
57
- "Bombay Cat": "30",
58
- "British Shorthair Cat": "8",
59
- "Egyptian Mau Cat": "7",
60
- "Maine Coon Cat": "15",
61
- "Persian Cat": "19",
62
- "Ragdoll Cat": "10",
63
- "Russian Blue Cat": "35",
64
- "Siamese Cat": "28",
65
- "Sphynx Cat": "29",
66
- "american bulldog Dog": "20",
67
- "american pit bull terrier Dog": "2",
68
- "basset hound Dog": "36",
69
- "beagle Dog": "3",
70
- "boxer Dog": "21",
71
- "chihuahua Dog": "13",
72
- "english cocker spaniel Dog": "26",
73
- "english setter Dog": "27",
74
- "german shorthaired Dog": "23",
75
- "great pyrenees Dog": "34",
76
- "havanese Dog": "32",
77
- "japanese chin Dog": "1",
78
- "keeshond Dog": "31",
79
- "leonberger Dog": "22",
80
- "miniature pinscher Dog": "4",
81
- "newfoundland Dog": "16",
82
- "pomeranian Dog": "5",
83
- "pug Dog": "0",
84
- "saint bernard Dog": "18",
85
- "samoyed Dog": "14",
86
- "scottish terrier Dog": "11",
87
- "shiba inu Dog": "12",
88
- "staffordshire bull terrier Dog": "24",
89
- "wheaten terrier Dog": "9",
90
- "yorkshire terrier Dog": "6"
91
  },
92
  "layer_norm_eps": 1e-12,
93
  "model_type": "vit",
 
9
  "hidden_dropout_prob": 0.0,
10
  "hidden_size": 768,
11
  "id2label": {
12
+ "0": "saint bernard dog",
13
+ "1": "Ragdoll cat",
14
+ "10": "keeshond dog",
15
+ "11": "english cocker spaniel dog",
16
+ "12": "beagle dog",
17
+ "13": "Russian Blue cat",
18
+ "14": "scottish terrier dog",
19
+ "15": "newfoundland dog",
20
+ "16": "Bombay cat",
21
+ "17": "Bengal cat",
22
+ "18": "japanese chin dog",
23
+ "19": "Sphynx cat",
24
+ "2": "havanese dog",
25
+ "20": "Persian cat",
26
+ "21": "shiba inu dog",
27
+ "22": "english setter dog",
28
+ "23": "great pyrenees dog",
29
+ "24": "chihuahua dog",
30
+ "25": "miniature pinscher dog",
31
+ "26": "pomeranian dog",
32
+ "27": "Abyssinian cat",
33
+ "28": "basset hound dog",
34
+ "29": "Birman cat",
35
+ "3": "Egyptian Mau cat",
36
+ "30": "german shorthaired dog",
37
+ "31": "american bulldog dog",
38
+ "32": "american pit bull terrier dog",
39
+ "33": "staffordshire bull terrier dog",
40
+ "34": "Maine Coon cat",
41
+ "35": "samoyed dog",
42
+ "36": "boxer dog",
43
+ "4": "Siamese cat",
44
+ "5": "yorkshire terrier dog",
45
+ "6": "leonberger dog",
46
+ "7": "pug dog",
47
+ "8": "British Shorthair cat",
48
+ "9": "wheaten terrier dog"
49
  },
50
  "image_size": 224,
51
  "initializer_range": 0.02,
52
  "intermediate_size": 3072,
53
  "label2id": {
54
+ "Abyssinian cat": "27",
55
+ "Bengal cat": "17",
56
+ "Birman cat": "29",
57
+ "Bombay cat": "16",
58
+ "British Shorthair cat": "8",
59
+ "Egyptian Mau cat": "3",
60
+ "Maine Coon cat": "34",
61
+ "Persian cat": "20",
62
+ "Ragdoll cat": "1",
63
+ "Russian Blue cat": "13",
64
+ "Siamese cat": "4",
65
+ "Sphynx cat": "19",
66
+ "american bulldog dog": "31",
67
+ "american pit bull terrier dog": "32",
68
+ "basset hound dog": "28",
69
+ "beagle dog": "12",
70
+ "boxer dog": "36",
71
+ "chihuahua dog": "24",
72
+ "english cocker spaniel dog": "11",
73
+ "english setter dog": "22",
74
+ "german shorthaired dog": "30",
75
+ "great pyrenees dog": "23",
76
+ "havanese dog": "2",
77
+ "japanese chin dog": "18",
78
+ "keeshond dog": "10",
79
+ "leonberger dog": "6",
80
+ "miniature pinscher dog": "25",
81
+ "newfoundland dog": "15",
82
+ "pomeranian dog": "26",
83
+ "pug dog": "7",
84
+ "saint bernard dog": "0",
85
+ "samoyed dog": "35",
86
+ "scottish terrier dog": "14",
87
+ "shiba inu dog": "21",
88
+ "staffordshire bull terrier dog": "33",
89
+ "wheaten terrier dog": "9",
90
+ "yorkshire terrier dog": "5"
91
  },
92
  "layer_norm_eps": 1e-12,
93
  "model_type": "vit",
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:04c848cc43804867db29602b8504492e7669c8936edf4fab2780cde439ea8afb
3
  size 343331644
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f4f4ab3ed13788b1a113a26071c629101eaba9b3e315e4735a8da6b713924ef
3
  size 343331644
runs/Mar30_12-24-34_837f7dd663e8/events.out.tfevents.1711801475.837f7dd663e8.302.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8924dd3a6ae61d904ce53789aa448b46fef86f8445ba5c9b93855ead830a4a1a
3
+ size 17991
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 2.2913817801515827e+18,
4
- "train_loss": 0.202088082896682,
5
- "train_runtime": 854.7685,
6
- "train_samples_per_second": 34.582,
7
- "train_steps_per_second": 0.271
8
  }
 
1
  {
2
  "epoch": 4.0,
3
  "total_flos": 2.2913817801515827e+18,
4
+ "train_loss": 0.19263494449491003,
5
+ "train_runtime": 813.2364,
6
+ "train_samples_per_second": 36.349,
7
+ "train_steps_per_second": 0.571
8
  }
trainer_state.json CHANGED
@@ -1,209 +1,388 @@
1
  {
2
- "best_metric": 0.004596503917127848,
3
- "best_model_checkpoint": "./vit-base-pets/checkpoint-200",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
- "global_step": 232,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
11
  {
12
  "epoch": 0.17,
13
- "grad_norm": 168186.984375,
14
  "learning_rate": 0.0002870689655172413,
15
- "loss": 2.1401,
16
- "step": 10
 
 
 
 
 
 
 
17
  },
18
  {
19
  "epoch": 0.34,
20
- "grad_norm": 134643.921875,
21
  "learning_rate": 0.0002741379310344827,
22
- "loss": 0.5937,
23
- "step": 20
 
 
 
 
 
 
 
24
  },
25
  {
26
  "epoch": 0.52,
27
- "grad_norm": 117247.1015625,
28
  "learning_rate": 0.00026120689655172413,
29
- "loss": 0.3283,
30
- "step": 30
 
 
 
 
 
 
 
31
  },
32
  {
33
  "epoch": 0.69,
34
- "grad_norm": 148088.53125,
35
  "learning_rate": 0.0002482758620689655,
36
- "loss": 0.3447,
37
- "step": 40
 
 
 
 
 
 
 
38
  },
39
  {
40
  "epoch": 0.86,
41
- "grad_norm": 118186.203125,
42
  "learning_rate": 0.00023534482758620685,
43
- "loss": 0.2972,
44
- "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  },
46
  {
47
  "epoch": 1.03,
48
- "grad_norm": 89050.390625,
49
  "learning_rate": 0.00022241379310344826,
50
- "loss": 0.2425,
51
- "step": 60
 
 
 
 
 
 
 
52
  },
53
  {
54
  "epoch": 1.21,
55
- "grad_norm": 50770.4296875,
56
  "learning_rate": 0.00020948275862068963,
57
- "loss": 0.1025,
58
- "step": 70
 
 
 
 
 
 
 
59
  },
60
  {
61
  "epoch": 1.38,
62
- "grad_norm": 102693.65625,
63
  "learning_rate": 0.000196551724137931,
64
- "loss": 0.0998,
65
- "step": 80
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 1.55,
69
- "grad_norm": 58018.40234375,
70
  "learning_rate": 0.0001836206896551724,
71
- "loss": 0.1048,
72
- "step": 90
 
 
 
 
 
 
 
73
  },
74
  {
75
  "epoch": 1.72,
76
- "grad_norm": 51126.80078125,
77
  "learning_rate": 0.0001706896551724138,
78
- "loss": 0.0887,
79
- "step": 100
80
  },
81
  {
82
  "epoch": 1.72,
83
- "eval_accuracy": 0.9774018944519621,
84
- "eval_loss": 0.07652416825294495,
85
- "eval_runtime": 90.7155,
86
- "eval_samples_per_second": 81.463,
87
- "eval_steps_per_second": 5.093,
88
- "step": 100
 
 
 
 
 
 
 
89
  },
90
  {
91
  "epoch": 1.9,
92
- "grad_norm": 86962.4921875,
93
  "learning_rate": 0.00015775862068965517,
94
- "loss": 0.0926,
95
- "step": 110
 
 
 
 
 
 
 
96
  },
97
  {
98
  "epoch": 2.07,
99
- "grad_norm": 72358.4609375,
100
  "learning_rate": 0.00014482758620689654,
101
- "loss": 0.0706,
102
- "step": 120
 
 
 
 
 
 
 
103
  },
104
  {
105
  "epoch": 2.24,
106
- "grad_norm": 20616.7109375,
107
  "learning_rate": 0.00013189655172413792,
108
- "loss": 0.039,
109
- "step": 130
 
 
 
 
 
 
 
110
  },
111
  {
112
  "epoch": 2.41,
113
- "grad_norm": 70676.0625,
114
  "learning_rate": 0.00011896551724137931,
115
- "loss": 0.0351,
116
- "step": 140
 
 
 
 
 
 
 
117
  },
118
  {
119
  "epoch": 2.59,
120
- "grad_norm": 12601.1337890625,
121
  "learning_rate": 0.00010603448275862067,
122
- "loss": 0.0263,
123
- "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  },
125
  {
126
  "epoch": 2.76,
127
- "grad_norm": 77678.21875,
128
  "learning_rate": 9.310344827586206e-05,
129
- "loss": 0.0296,
130
- "step": 160
 
 
 
 
 
 
 
131
  },
132
  {
133
  "epoch": 2.93,
134
- "grad_norm": 36381.59765625,
135
  "learning_rate": 8.017241379310344e-05,
136
- "loss": 0.025,
137
- "step": 170
 
 
 
 
 
 
 
138
  },
139
  {
140
  "epoch": 3.1,
141
- "grad_norm": 12375.634765625,
142
  "learning_rate": 6.724137931034483e-05,
143
- "loss": 0.0087,
144
- "step": 180
 
 
 
 
 
 
 
145
  },
146
  {
147
  "epoch": 3.28,
148
- "grad_norm": 1895.7933349609375,
149
  "learning_rate": 5.4310344827586204e-05,
150
- "loss": 0.0043,
151
- "step": 190
 
 
 
 
 
 
 
152
  },
153
  {
154
  "epoch": 3.45,
155
- "grad_norm": 49621.76171875,
156
  "learning_rate": 4.137931034482758e-05,
157
- "loss": 0.0045,
158
- "step": 200
159
  },
160
  {
161
  "epoch": 3.45,
162
- "eval_accuracy": 0.9989174560216508,
163
- "eval_loss": 0.004596503917127848,
164
- "eval_runtime": 91.6398,
165
- "eval_samples_per_second": 80.642,
166
- "eval_steps_per_second": 5.041,
167
- "step": 200
 
 
 
 
 
 
 
168
  },
169
  {
170
  "epoch": 3.62,
171
- "grad_norm": 798.2269287109375,
172
  "learning_rate": 2.8448275862068963e-05,
173
- "loss": 0.0045,
174
- "step": 210
 
 
 
 
 
 
 
175
  },
176
  {
177
  "epoch": 3.79,
178
- "grad_norm": 2413.81640625,
179
  "learning_rate": 1.5517241379310342e-05,
180
- "loss": 0.0034,
181
- "step": 220
 
 
 
 
 
 
 
182
  },
183
  {
184
  "epoch": 3.97,
185
- "grad_norm": 908.8674926757812,
186
  "learning_rate": 2.5862068965517237e-06,
187
- "loss": 0.0021,
188
- "step": 230
189
  },
190
  {
191
  "epoch": 4.0,
192
- "step": 232,
193
  "total_flos": 2.2913817801515827e+18,
194
- "train_loss": 0.202088082896682,
195
- "train_runtime": 854.7685,
196
- "train_samples_per_second": 34.582,
197
- "train_steps_per_second": 0.271
198
  }
199
  ],
200
  "logging_steps": 10,
201
- "max_steps": 232,
202
  "num_input_tokens_seen": 0,
203
  "num_train_epochs": 4,
204
  "save_steps": 100,
205
  "total_flos": 2.2913817801515827e+18,
206
- "train_batch_size": 128,
207
  "trial_name": null,
208
  "trial_params": null
209
  }
 
1
  {
2
+ "best_metric": 0.005833666305989027,
3
+ "best_model_checkpoint": "./vit-base-pets/checkpoint-400",
4
  "epoch": 4.0,
5
  "eval_steps": 100,
6
+ "global_step": 464,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
+ {
12
+ "epoch": 0.09,
13
+ "grad_norm": 3.28558611869812,
14
+ "learning_rate": 0.00029353448275862065,
15
+ "loss": 2.365,
16
+ "step": 10
17
+ },
18
  {
19
  "epoch": 0.17,
20
+ "grad_norm": 2.781508684158325,
21
  "learning_rate": 0.0002870689655172413,
22
+ "loss": 0.7805,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.26,
27
+ "grad_norm": 2.789832353591919,
28
+ "learning_rate": 0.00028060344827586205,
29
+ "loss": 0.4669,
30
+ "step": 30
31
  },
32
  {
33
  "epoch": 0.34,
34
+ "grad_norm": 3.0019114017486572,
35
  "learning_rate": 0.0002741379310344827,
36
+ "loss": 0.4777,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.43,
41
+ "grad_norm": 2.8678109645843506,
42
+ "learning_rate": 0.00026767241379310345,
43
+ "loss": 0.3825,
44
+ "step": 50
45
  },
46
  {
47
  "epoch": 0.52,
48
+ "grad_norm": 1.979690432548523,
49
  "learning_rate": 0.00026120689655172413,
50
+ "loss": 0.3924,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.6,
55
+ "grad_norm": 1.9895226955413818,
56
+ "learning_rate": 0.0002547413793103448,
57
+ "loss": 0.307,
58
+ "step": 70
59
  },
60
  {
61
  "epoch": 0.69,
62
+ "grad_norm": 2.970583438873291,
63
  "learning_rate": 0.0002482758620689655,
64
+ "loss": 0.4071,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.78,
69
+ "grad_norm": 3.0911920070648193,
70
+ "learning_rate": 0.00024181034482758618,
71
+ "loss": 0.3594,
72
+ "step": 90
73
  },
74
  {
75
  "epoch": 0.86,
76
+ "grad_norm": 2.5045769214630127,
77
  "learning_rate": 0.00023534482758620685,
78
+ "loss": 0.3713,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.86,
83
+ "eval_accuracy": 0.9307171853856563,
84
+ "eval_loss": 0.20842242240905762,
85
+ "eval_runtime": 68.2075,
86
+ "eval_samples_per_second": 108.346,
87
+ "eval_steps_per_second": 13.547,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.95,
92
+ "grad_norm": 1.4575306177139282,
93
+ "learning_rate": 0.00022887931034482758,
94
+ "loss": 0.3301,
95
+ "step": 110
96
  },
97
  {
98
  "epoch": 1.03,
99
+ "grad_norm": 1.9479578733444214,
100
  "learning_rate": 0.00022241379310344826,
101
+ "loss": 0.2504,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 1.12,
106
+ "grad_norm": 1.73411226272583,
107
+ "learning_rate": 0.00021594827586206896,
108
+ "loss": 0.1697,
109
+ "step": 130
110
  },
111
  {
112
  "epoch": 1.21,
113
+ "grad_norm": 1.672253966331482,
114
  "learning_rate": 0.00020948275862068963,
115
+ "loss": 0.116,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 1.29,
120
+ "grad_norm": 2.3492820262908936,
121
+ "learning_rate": 0.00020301724137931034,
122
+ "loss": 0.158,
123
+ "step": 150
124
  },
125
  {
126
  "epoch": 1.38,
127
+ "grad_norm": 1.4678938388824463,
128
  "learning_rate": 0.000196551724137931,
129
+ "loss": 0.1487,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 1.47,
134
+ "grad_norm": 2.1428756713867188,
135
+ "learning_rate": 0.00019008620689655169,
136
+ "loss": 0.1121,
137
+ "step": 170
138
  },
139
  {
140
  "epoch": 1.55,
141
+ "grad_norm": 1.255344271659851,
142
  "learning_rate": 0.0001836206896551724,
143
+ "loss": 0.1548,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 1.64,
148
+ "grad_norm": 1.692832350730896,
149
+ "learning_rate": 0.0001771551724137931,
150
+ "loss": 0.0916,
151
+ "step": 190
152
  },
153
  {
154
  "epoch": 1.72,
155
+ "grad_norm": 2.322737693786621,
156
  "learning_rate": 0.0001706896551724138,
157
+ "loss": 0.1173,
158
+ "step": 200
159
  },
160
  {
161
  "epoch": 1.72,
162
+ "eval_accuracy": 0.976319350473613,
163
+ "eval_loss": 0.07739943265914917,
164
+ "eval_runtime": 67.8967,
165
+ "eval_samples_per_second": 108.842,
166
+ "eval_steps_per_second": 13.609,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 1.81,
171
+ "grad_norm": 1.99238920211792,
172
+ "learning_rate": 0.00016422413793103446,
173
+ "loss": 0.1311,
174
+ "step": 210
175
  },
176
  {
177
  "epoch": 1.9,
178
+ "grad_norm": 2.3652477264404297,
179
  "learning_rate": 0.00015775862068965517,
180
+ "loss": 0.1114,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 1.98,
185
+ "grad_norm": 1.3925710916519165,
186
+ "learning_rate": 0.00015129310344827584,
187
+ "loss": 0.1235,
188
+ "step": 230
189
  },
190
  {
191
  "epoch": 2.07,
192
+ "grad_norm": 2.0290815830230713,
193
  "learning_rate": 0.00014482758620689654,
194
+ "loss": 0.0772,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 2.16,
199
+ "grad_norm": 2.4121060371398926,
200
+ "learning_rate": 0.00013836206896551724,
201
+ "loss": 0.0715,
202
+ "step": 250
203
  },
204
  {
205
  "epoch": 2.24,
206
+ "grad_norm": 0.9658297300338745,
207
  "learning_rate": 0.00013189655172413792,
208
+ "loss": 0.0444,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 2.33,
213
+ "grad_norm": 0.24860858917236328,
214
+ "learning_rate": 0.0001254310344827586,
215
+ "loss": 0.0668,
216
+ "step": 270
217
  },
218
  {
219
  "epoch": 2.41,
220
+ "grad_norm": 1.50627601146698,
221
  "learning_rate": 0.00011896551724137931,
222
+ "loss": 0.034,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 2.5,
227
+ "grad_norm": 1.2053415775299072,
228
+ "learning_rate": 0.0001125,
229
+ "loss": 0.0489,
230
+ "step": 290
231
  },
232
  {
233
  "epoch": 2.59,
234
+ "grad_norm": 1.2974027395248413,
235
  "learning_rate": 0.00010603448275862067,
236
+ "loss": 0.0612,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 2.59,
241
+ "eval_accuracy": 0.9947225981055481,
242
+ "eval_loss": 0.021214015781879425,
243
+ "eval_runtime": 67.3034,
244
+ "eval_samples_per_second": 109.801,
245
+ "eval_steps_per_second": 13.729,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 2.67,
250
+ "grad_norm": 0.22053079307079315,
251
+ "learning_rate": 9.956896551724137e-05,
252
+ "loss": 0.0308,
253
+ "step": 310
254
  },
255
  {
256
  "epoch": 2.76,
257
+ "grad_norm": 0.8180058002471924,
258
  "learning_rate": 9.310344827586206e-05,
259
+ "loss": 0.03,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 2.84,
264
+ "grad_norm": 1.5855587720870972,
265
+ "learning_rate": 8.663793103448275e-05,
266
+ "loss": 0.0235,
267
+ "step": 330
268
  },
269
  {
270
  "epoch": 2.93,
271
+ "grad_norm": 1.7537671327590942,
272
  "learning_rate": 8.017241379310344e-05,
273
+ "loss": 0.0225,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 3.02,
278
+ "grad_norm": 0.10956920683383942,
279
+ "learning_rate": 7.370689655172413e-05,
280
+ "loss": 0.0156,
281
+ "step": 350
282
  },
283
  {
284
  "epoch": 3.1,
285
+ "grad_norm": 1.1361974477767944,
286
  "learning_rate": 6.724137931034483e-05,
287
+ "loss": 0.0094,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 3.19,
292
+ "grad_norm": 0.9139267802238464,
293
+ "learning_rate": 6.077586206896551e-05,
294
+ "loss": 0.0128,
295
+ "step": 370
296
  },
297
  {
298
  "epoch": 3.28,
299
+ "grad_norm": 0.12278908491134644,
300
  "learning_rate": 5.4310344827586204e-05,
301
+ "loss": 0.0087,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 3.36,
306
+ "grad_norm": 0.0675448328256607,
307
+ "learning_rate": 4.78448275862069e-05,
308
+ "loss": 0.0048,
309
+ "step": 390
310
  },
311
  {
312
  "epoch": 3.45,
313
+ "grad_norm": 0.05717047303915024,
314
  "learning_rate": 4.137931034482758e-05,
315
+ "loss": 0.007,
316
+ "step": 400
317
  },
318
  {
319
  "epoch": 3.45,
320
+ "eval_accuracy": 0.9987821380243572,
321
+ "eval_loss": 0.005833666305989027,
322
+ "eval_runtime": 68.7353,
323
+ "eval_samples_per_second": 107.514,
324
+ "eval_steps_per_second": 13.443,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 3.53,
329
+ "grad_norm": 0.04572397843003273,
330
+ "learning_rate": 3.4913793103448275e-05,
331
+ "loss": 0.0051,
332
+ "step": 410
333
  },
334
  {
335
  "epoch": 3.62,
336
+ "grad_norm": 0.16931650042533875,
337
  "learning_rate": 2.8448275862068963e-05,
338
+ "loss": 0.006,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 3.71,
343
+ "grad_norm": 0.041402418166399,
344
+ "learning_rate": 2.198275862068965e-05,
345
+ "loss": 0.0054,
346
+ "step": 430
347
  },
348
  {
349
  "epoch": 3.79,
350
+ "grad_norm": 0.02362515777349472,
351
  "learning_rate": 1.5517241379310342e-05,
352
+ "loss": 0.0086,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 3.88,
357
+ "grad_norm": 0.3355587422847748,
358
+ "learning_rate": 9.051724137931034e-06,
359
+ "loss": 0.0039,
360
+ "step": 450
361
  },
362
  {
363
  "epoch": 3.97,
364
+ "grad_norm": 0.03504301235079765,
365
  "learning_rate": 2.5862068965517237e-06,
366
+ "loss": 0.0135,
367
+ "step": 460
368
  },
369
  {
370
  "epoch": 4.0,
371
+ "step": 464,
372
  "total_flos": 2.2913817801515827e+18,
373
+ "train_loss": 0.19263494449491003,
374
+ "train_runtime": 813.2364,
375
+ "train_samples_per_second": 36.349,
376
+ "train_steps_per_second": 0.571
377
  }
378
  ],
379
  "logging_steps": 10,
380
+ "max_steps": 464,
381
  "num_input_tokens_seen": 0,
382
  "num_train_epochs": 4,
383
  "save_steps": 100,
384
  "total_flos": 2.2913817801515827e+18,
385
+ "train_batch_size": 64,
386
  "trial_name": null,
387
  "trial_params": null
388
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0bf29b9cfaa423385553cd4c1a9ee40cd1712cd86c4bf40b3c7f39094ea7217
3
  size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2d9f5cc8c59e3763badc8aaac1a4d41fe76330287aca603b90141946989fe67
3
  size 4920