rlaaudrb1104 commited on
Commit
c6db5b6
·
1 Parent(s): 6dee97d

ccit_final

Browse files
Files changed (6) hide show
  1. config.json +1 -1
  2. model.safetensors +2 -2
  3. optimizer.pt +2 -2
  4. rng_state.pth +1 -1
  5. scheduler.pt +1 -1
  6. trainer_state.json +501 -164
config.json CHANGED
@@ -20,7 +20,7 @@
20
  "pad_token_id": 1,
21
  "position_embedding_type": "absolute",
22
  "torch_dtype": "float32",
23
- "transformers_version": "4.41.1",
24
  "type_vocab_size": 1,
25
  "use_cache": true,
26
  "vocab_size": 50265
 
20
  "pad_token_id": 1,
21
  "position_embedding_type": "absolute",
22
  "torch_dtype": "float32",
23
+ "transformers_version": "4.41.2",
24
  "type_vocab_size": 1,
25
  "use_cache": true,
26
  "vocab_size": 50265
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9bcae3293b5d29ff9f520b875765460e825e12ec2ecccdf5fb2635ab3d058ec
3
- size 498692860
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6aad7c2f42e665fe68a35f0abf79a2483077c39dd42fba313390c6515423a69
3
+ size 498705152
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eeda6c76b7102eb22693937d10b880ae3af91f76df0a3a54359358853fc95e33
3
- size 997506746
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7e0a9772004bf6dc7cb7d761cdf5209bcbc125cb4ff8281b6b4af487abc8608
3
+ size 997531514
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfc880f662532368769bf627f8612e2c4b90986f8b4c7b89b3f45762dc6d424d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4297877051912922800eeda2dfabec7286e3f19a18bb48fac3d835eb91c6dbae
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1b227ad2a10700168d2b014fe9a1d55c46d4ec5e2f18bd52971da8f64d4b44b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db3e53efd18aae3a98eba7ce2c13a309a6e1555c5a2fe5b6c71d6270566d2fef
3
  size 1064
trainer_state.json CHANGED
@@ -1,219 +1,556 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.685408299866131,
5
  "eval_steps": 500,
6
- "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.6693440428380187,
13
- "grad_norm": 14.38033676147461,
14
  "learning_rate": 4.97e-05,
15
- "loss": 1.6741,
16
  "step": 500
17
  },
18
  {
19
- "epoch": 0.6693440428380187,
20
- "eval_accuracy": 0.6358458961474037,
21
- "eval_f1_class_0": 0.9606299212598425,
22
- "eval_f1_class_1": 0.4562647754137116,
23
- "eval_f1_class_2": 0.5818673883626523,
24
- "eval_f1_class_3": 0.6752,
25
- "eval_f1_class_4": 0.4695259593679458,
26
- "eval_f1_class_5": 0.6699186991869919,
27
- "eval_f1_class_6": 0.6111908177905309,
28
- "eval_f1_class_7": 0.6285714285714286,
29
- "eval_f1_class_8": 0.7056603773584904,
30
- "eval_f1_macro": 0.6398699297012882,
31
- "eval_f1_micro": 0.6358458961474037,
32
- "eval_f1_weighted": 0.6376816033602213,
33
- "eval_loss": 1.0920162200927734,
34
- "eval_recall_weighted": 0.6358458961474037,
35
- "eval_runtime": 52.0874,
36
- "eval_samples_per_second": 57.307,
37
- "eval_steps_per_second": 1.805,
 
38
  "step": 500
39
  },
40
  {
41
- "epoch": 1.3386880856760375,
42
- "grad_norm": 13.991486549377441,
43
- "learning_rate": 4.643472022955524e-05,
44
- "loss": 0.77,
45
  "step": 1000
46
  },
47
  {
48
- "epoch": 1.3386880856760375,
49
- "eval_accuracy": 0.8395309882747068,
50
- "eval_f1_class_0": 0.9654088050314467,
51
- "eval_f1_class_1": 0.7083333333333334,
52
- "eval_f1_class_2": 0.8022598870056498,
53
- "eval_f1_class_3": 0.8431952662721893,
54
- "eval_f1_class_4": 0.7962674961119751,
55
- "eval_f1_class_5": 0.8715447154471545,
56
- "eval_f1_class_6": 0.8243430152143845,
57
- "eval_f1_class_7": 0.8963210702341138,
58
- "eval_f1_class_8": 0.8755760368663594,
59
- "eval_f1_macro": 0.8425832917240673,
60
- "eval_f1_micro": 0.8395309882747068,
61
- "eval_f1_weighted": 0.8406783887287406,
62
- "eval_loss": 0.5749590992927551,
63
- "eval_recall_weighted": 0.8395309882747068,
64
- "eval_runtime": 52.1038,
65
- "eval_samples_per_second": 57.289,
66
- "eval_steps_per_second": 1.804,
 
67
  "step": 1000
68
  },
69
  {
70
- "epoch": 2.0080321285140563,
71
- "grad_norm": 2.8914544582366943,
72
- "learning_rate": 4.284791965566714e-05,
73
- "loss": 0.4007,
74
  "step": 1500
75
  },
76
  {
77
- "epoch": 2.0080321285140563,
78
- "eval_accuracy": 0.8931323283082077,
79
- "eval_f1_class_0": 0.96875,
80
- "eval_f1_class_1": 0.7915492957746477,
81
- "eval_f1_class_2": 0.8837209302325582,
82
- "eval_f1_class_3": 0.838235294117647,
83
- "eval_f1_class_4": 0.844574780058651,
84
- "eval_f1_class_5": 0.9504,
85
- "eval_f1_class_6": 0.9249617151607963,
86
- "eval_f1_class_7": 0.9345794392523364,
87
- "eval_f1_class_8": 0.9199999999999999,
88
- "eval_f1_macro": 0.895196828288515,
89
- "eval_f1_micro": 0.8931323283082077,
90
- "eval_f1_weighted": 0.8935187314222036,
91
- "eval_loss": 0.3943232595920563,
92
- "eval_recall_weighted": 0.8931323283082077,
93
- "eval_runtime": 52.1055,
94
- "eval_samples_per_second": 57.288,
95
- "eval_steps_per_second": 1.804,
 
96
  "step": 1500
97
  },
98
  {
99
- "epoch": 2.677376171352075,
100
- "grad_norm": 2.264118194580078,
101
- "learning_rate": 3.926111908177906e-05,
102
- "loss": 0.2025,
103
  "step": 2000
104
  },
105
  {
106
- "epoch": 2.677376171352075,
107
- "eval_accuracy": 0.9165829145728643,
108
- "eval_f1_class_0": 0.9734789391575663,
109
- "eval_f1_class_1": 0.8170563961485556,
110
- "eval_f1_class_2": 0.9121338912133892,
111
- "eval_f1_class_3": 0.910384068278805,
112
- "eval_f1_class_4": 0.8715447154471545,
113
- "eval_f1_class_5": 0.9562499999999999,
114
- "eval_f1_class_6": 0.9147058823529411,
115
- "eval_f1_class_7": 0.9559748427672956,
116
- "eval_f1_class_8": 0.9525368248772503,
117
- "eval_f1_macro": 0.918229506693662,
118
- "eval_f1_micro": 0.9165829145728643,
119
- "eval_f1_weighted": 0.9169458307013622,
120
- "eval_loss": 0.392281711101532,
121
- "eval_recall_weighted": 0.9165829145728643,
122
- "eval_runtime": 52.1029,
123
- "eval_samples_per_second": 57.29,
124
- "eval_steps_per_second": 1.804,
 
125
  "step": 2000
126
  },
127
  {
128
- "epoch": 3.3467202141900936,
129
- "grad_norm": 9.838397979736328,
130
- "learning_rate": 3.5674318507890965e-05,
131
- "loss": 0.1351,
132
  "step": 2500
133
  },
134
  {
135
- "epoch": 3.3467202141900936,
136
- "eval_accuracy": 0.9252931323283082,
137
- "eval_f1_class_0": 0.9750778816199377,
138
- "eval_f1_class_1": 0.8636363636363636,
139
- "eval_f1_class_2": 0.909814323607427,
140
- "eval_f1_class_3": 0.8973607038123167,
141
- "eval_f1_class_4": 0.8852459016393442,
142
- "eval_f1_class_5": 0.9549839228295819,
143
- "eval_f1_class_6": 0.9319526627218935,
144
- "eval_f1_class_7": 0.9635499207606973,
145
- "eval_f1_class_8": 0.9556962025316457,
146
- "eval_f1_macro": 0.9263686536843565,
147
- "eval_f1_micro": 0.9252931323283082,
148
- "eval_f1_weighted": 0.9251223151765453,
149
- "eval_loss": 0.38727396726608276,
150
- "eval_recall_weighted": 0.9252931323283082,
151
- "eval_runtime": 52.0796,
152
- "eval_samples_per_second": 57.316,
153
- "eval_steps_per_second": 1.805,
 
154
  "step": 2500
155
  },
156
  {
157
- "epoch": 4.016064257028113,
158
- "grad_norm": 3.1932365894317627,
159
- "learning_rate": 3.208751793400287e-05,
160
- "loss": 0.1193,
161
  "step": 3000
162
  },
163
  {
164
- "epoch": 4.016064257028113,
165
- "eval_accuracy": 0.9366834170854271,
166
- "eval_f1_class_0": 0.9753086419753085,
167
- "eval_f1_class_1": 0.8781954887218045,
168
- "eval_f1_class_2": 0.9465020576131686,
169
- "eval_f1_class_3": 0.8873626373626373,
170
- "eval_f1_class_4": 0.9245283018867925,
171
- "eval_f1_class_5": 0.9478672985781991,
172
- "eval_f1_class_6": 0.948301329394387,
173
- "eval_f1_class_7": 0.9695999999999999,
174
- "eval_f1_class_8": 0.9602543720190778,
175
- "eval_f1_macro": 0.9375466808390418,
176
- "eval_f1_micro": 0.9366834170854271,
177
- "eval_f1_weighted": 0.9367077203287543,
178
- "eval_loss": 0.36861082911491394,
179
- "eval_recall_weighted": 0.9366834170854271,
180
- "eval_runtime": 52.0777,
181
- "eval_samples_per_second": 57.318,
182
- "eval_steps_per_second": 1.805,
 
183
  "step": 3000
184
  },
185
  {
186
- "epoch": 4.685408299866131,
187
- "grad_norm": 16.49643325805664,
188
- "learning_rate": 2.850071736011478e-05,
189
- "loss": 0.063,
190
  "step": 3500
191
  },
192
  {
193
- "epoch": 4.685408299866131,
194
- "eval_accuracy": 0.9396984924623115,
195
- "eval_f1_class_0": 0.9751552795031054,
196
- "eval_f1_class_1": 0.8555240793201134,
197
- "eval_f1_class_2": 0.9222222222222223,
198
- "eval_f1_class_3": 0.9184549356223176,
199
- "eval_f1_class_4": 0.9200603318250377,
200
- "eval_f1_class_5": 0.9730586370839936,
201
- "eval_f1_class_6": 0.9650986342943855,
202
- "eval_f1_class_7": 0.964968152866242,
203
- "eval_f1_class_8": 0.9774193548387097,
204
- "eval_f1_macro": 0.9413290697306808,
205
- "eval_f1_micro": 0.9396984924623115,
206
- "eval_f1_weighted": 0.9399337568761867,
207
- "eval_loss": 0.3695576786994934,
208
- "eval_recall_weighted": 0.9396984924623115,
209
- "eval_runtime": 52.0778,
210
- "eval_samples_per_second": 57.318,
211
- "eval_steps_per_second": 1.805,
 
212
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  }
214
  ],
215
  "logging_steps": 500,
216
- "max_steps": 7470,
217
  "num_input_tokens_seen": 0,
218
  "num_train_epochs": 10,
219
  "save_steps": 500,
@@ -229,7 +566,7 @@
229
  "attributes": {}
230
  }
231
  },
232
- "total_flos": 1.47218650942464e+16,
233
  "train_batch_size": 16,
234
  "trial_name": null,
235
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.433962264150944,
5
  "eval_steps": 500,
6
+ "global_step": 9000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.5241090146750524,
13
+ "grad_norm": 14.80826473236084,
14
  "learning_rate": 4.97e-05,
15
+ "loss": 1.8113,
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 0.5241090146750524,
20
+ "eval_accuracy": 0.6428945988463556,
21
+ "eval_f1_class_0": 0.632183908045977,
22
+ "eval_f1_class_1": 0.9472222222222223,
23
+ "eval_f1_class_2": 0.3657331136738056,
24
+ "eval_f1_class_3": 0.5542168674698795,
25
+ "eval_f1_class_4": 0.6165254237288136,
26
+ "eval_f1_class_5": 0.6261808367071524,
27
+ "eval_f1_class_6": 0.6833541927409262,
28
+ "eval_f1_class_7": 0.5899053627760252,
29
+ "eval_f1_class_8": 0.662280701754386,
30
+ "eval_f1_class_9": 0.6956521739130435,
31
+ "eval_f1_macro": 0.6373254803032232,
32
+ "eval_f1_micro": 0.6428945988463556,
33
+ "eval_f1_weighted": 0.6376303124409798,
34
+ "eval_loss": 1.1190885305404663,
35
+ "eval_recall_weighted": 0.6428945988463556,
36
+ "eval_runtime": 18.5604,
37
+ "eval_samples_per_second": 205.491,
38
+ "eval_steps_per_second": 6.465,
39
  "step": 500
40
  },
41
  {
42
+ "epoch": 1.0482180293501049,
43
+ "grad_norm": 6.733876705169678,
44
+ "learning_rate": 4.725110619469027e-05,
45
+ "loss": 0.885,
46
  "step": 1000
47
  },
48
  {
49
+ "epoch": 1.0482180293501049,
50
+ "eval_accuracy": 0.8185631882538018,
51
+ "eval_f1_class_0": 0.7974522292993631,
52
+ "eval_f1_class_1": 0.9651324965132496,
53
+ "eval_f1_class_2": 0.6306569343065694,
54
+ "eval_f1_class_3": 0.7193229901269393,
55
+ "eval_f1_class_4": 0.8341584158415841,
56
+ "eval_f1_class_5": 0.807061790668348,
57
+ "eval_f1_class_6": 0.8753246753246754,
58
+ "eval_f1_class_7": 0.8368794326241134,
59
+ "eval_f1_class_8": 0.8787446504992867,
60
+ "eval_f1_class_9": 0.8255528255528255,
61
+ "eval_f1_macro": 0.8170286440756953,
62
+ "eval_f1_micro": 0.8185631882538018,
63
+ "eval_f1_weighted": 0.8175319894578669,
64
+ "eval_loss": 0.6289433836936951,
65
+ "eval_recall_weighted": 0.8185631882538018,
66
+ "eval_runtime": 18.5795,
67
+ "eval_samples_per_second": 205.28,
68
+ "eval_steps_per_second": 6.459,
69
  "step": 1000
70
  },
71
  {
72
+ "epoch": 1.5723270440251573,
73
+ "grad_norm": 21.78145408630371,
74
+ "learning_rate": 4.449115044247788e-05,
75
+ "loss": 0.4651,
76
  "step": 1500
77
  },
78
  {
79
+ "epoch": 1.5723270440251573,
80
+ "eval_accuracy": 0.8615626638699528,
81
+ "eval_f1_class_0": 0.8236877523553162,
82
+ "eval_f1_class_1": 0.9653259361997226,
83
+ "eval_f1_class_2": 0.6986128625472887,
84
+ "eval_f1_class_3": 0.8308457711442785,
85
+ "eval_f1_class_4": 0.8667563930013459,
86
+ "eval_f1_class_5": 0.8346456692913385,
87
+ "eval_f1_class_6": 0.8898305084745762,
88
+ "eval_f1_class_7": 0.9136420525657072,
89
+ "eval_f1_class_8": 0.8668280871670703,
90
+ "eval_f1_class_9": 0.9410150891632374,
91
+ "eval_f1_macro": 0.8631190121909882,
92
+ "eval_f1_micro": 0.8615626638699528,
93
+ "eval_f1_weighted": 0.8631956720576096,
94
+ "eval_loss": 0.5186640024185181,
95
+ "eval_recall_weighted": 0.8615626638699528,
96
+ "eval_runtime": 18.5317,
97
+ "eval_samples_per_second": 205.809,
98
+ "eval_steps_per_second": 6.475,
99
  "step": 1500
100
  },
101
  {
102
+ "epoch": 2.0964360587002098,
103
+ "grad_norm": 6.887778282165527,
104
+ "learning_rate": 4.172566371681416e-05,
105
+ "loss": 0.3292,
106
  "step": 2000
107
  },
108
  {
109
+ "epoch": 2.0964360587002098,
110
+ "eval_accuracy": 0.9003670686942842,
111
+ "eval_f1_class_0": 0.8623188405797102,
112
+ "eval_f1_class_1": 0.9615384615384616,
113
+ "eval_f1_class_2": 0.7830045523520485,
114
+ "eval_f1_class_3": 0.9002624671916011,
115
+ "eval_f1_class_4": 0.8952618453865336,
116
+ "eval_f1_class_5": 0.8938271604938272,
117
+ "eval_f1_class_6": 0.9402173913043479,
118
+ "eval_f1_class_7": 0.9341935483870967,
119
+ "eval_f1_class_8": 0.9295039164490861,
120
+ "eval_f1_class_9": 0.8950131233595802,
121
+ "eval_f1_macro": 0.8995141307042294,
122
+ "eval_f1_micro": 0.9003670686942842,
123
+ "eval_f1_weighted": 0.899989630913346,
124
+ "eval_loss": 0.37930119037628174,
125
+ "eval_recall_weighted": 0.9003670686942842,
126
+ "eval_runtime": 18.5937,
127
+ "eval_samples_per_second": 205.123,
128
+ "eval_steps_per_second": 6.454,
129
  "step": 2000
130
  },
131
  {
132
+ "epoch": 2.620545073375262,
133
+ "grad_norm": 18.899120330810547,
134
+ "learning_rate": 3.896017699115044e-05,
135
+ "loss": 0.2064,
136
  "step": 2500
137
  },
138
  {
139
+ "epoch": 2.620545073375262,
140
+ "eval_accuracy": 0.916885159937074,
141
+ "eval_f1_class_0": 0.8660049627791564,
142
+ "eval_f1_class_1": 0.9670329670329672,
143
+ "eval_f1_class_2": 0.8037889039242219,
144
+ "eval_f1_class_3": 0.9132653061224489,
145
+ "eval_f1_class_4": 0.9252217997465145,
146
+ "eval_f1_class_5": 0.896719319562576,
147
+ "eval_f1_class_6": 0.9539295392953929,
148
+ "eval_f1_class_7": 0.9450000000000001,
149
+ "eval_f1_class_8": 0.9398601398601398,
150
+ "eval_f1_class_9": 0.9660056657223797,
151
+ "eval_f1_macro": 0.9176828604045797,
152
+ "eval_f1_micro": 0.916885159937074,
153
+ "eval_f1_weighted": 0.9176253934359663,
154
+ "eval_loss": 0.34075525403022766,
155
+ "eval_recall_weighted": 0.916885159937074,
156
+ "eval_runtime": 18.4847,
157
+ "eval_samples_per_second": 206.333,
158
+ "eval_steps_per_second": 6.492,
159
  "step": 2500
160
  },
161
  {
162
+ "epoch": 3.1446540880503147,
163
+ "grad_norm": 1.2899887561798096,
164
+ "learning_rate": 3.619469026548672e-05,
165
+ "loss": 0.1562,
166
  "step": 3000
167
  },
168
  {
169
+ "epoch": 3.1446540880503147,
170
+ "eval_accuracy": 0.9326166754063975,
171
+ "eval_f1_class_0": 0.8903061224489796,
172
+ "eval_f1_class_1": 0.9594594594594594,
173
+ "eval_f1_class_2": 0.8427128427128427,
174
+ "eval_f1_class_3": 0.9363867684478372,
175
+ "eval_f1_class_4": 0.9300000000000002,
176
+ "eval_f1_class_5": 0.9108433734939759,
177
+ "eval_f1_class_6": 0.9784366576819407,
178
+ "eval_f1_class_7": 0.9619289340101522,
179
+ "eval_f1_class_8": 0.9572192513368984,
180
+ "eval_f1_class_9": 0.9567642956764295,
181
+ "eval_f1_macro": 0.9324057705268516,
182
+ "eval_f1_micro": 0.9326166754063975,
183
+ "eval_f1_weighted": 0.9323951969454738,
184
+ "eval_loss": 0.29545533657073975,
185
+ "eval_recall_weighted": 0.9326166754063975,
186
+ "eval_runtime": 18.4597,
187
+ "eval_samples_per_second": 206.612,
188
+ "eval_steps_per_second": 6.501,
189
  "step": 3000
190
  },
191
  {
192
+ "epoch": 3.668763102725367,
193
+ "grad_norm": 7.331942558288574,
194
+ "learning_rate": 3.342920353982301e-05,
195
+ "loss": 0.1097,
196
  "step": 3500
197
  },
198
  {
199
+ "epoch": 3.668763102725367,
200
+ "eval_accuracy": 0.936811746198217,
201
+ "eval_f1_class_0": 0.8847497089639115,
202
+ "eval_f1_class_1": 0.967391304347826,
203
+ "eval_f1_class_2": 0.8646153846153847,
204
+ "eval_f1_class_3": 0.9375830013280213,
205
+ "eval_f1_class_4": 0.9284818067754077,
206
+ "eval_f1_class_5": 0.9315068493150684,
207
+ "eval_f1_class_6": 0.9733333333333333,
208
+ "eval_f1_class_7": 0.9563046192259677,
209
+ "eval_f1_class_8": 0.9731903485254693,
210
+ "eval_f1_class_9": 0.9495225102319237,
211
+ "eval_f1_macro": 0.9366678866662314,
212
+ "eval_f1_micro": 0.936811746198217,
213
+ "eval_f1_weighted": 0.9365912559521494,
214
+ "eval_loss": 0.3290887176990509,
215
+ "eval_recall_weighted": 0.936811746198217,
216
+ "eval_runtime": 18.5309,
217
+ "eval_samples_per_second": 205.818,
218
+ "eval_steps_per_second": 6.476,
219
  "step": 3500
220
+ },
221
+ {
222
+ "epoch": 4.1928721174004195,
223
+ "grad_norm": 0.010240758769214153,
224
+ "learning_rate": 3.066924778761062e-05,
225
+ "loss": 0.1123,
226
+ "step": 4000
227
+ },
228
+ {
229
+ "epoch": 4.1928721174004195,
230
+ "eval_accuracy": 0.9431043523859465,
231
+ "eval_f1_class_0": 0.9055118110236221,
232
+ "eval_f1_class_1": 0.9780821917808219,
233
+ "eval_f1_class_2": 0.8661870503597122,
234
+ "eval_f1_class_3": 0.9431524547803619,
235
+ "eval_f1_class_4": 0.9369592088998764,
236
+ "eval_f1_class_5": 0.9245742092457422,
237
+ "eval_f1_class_6": 0.9734042553191489,
238
+ "eval_f1_class_7": 0.958904109589041,
239
+ "eval_f1_class_8": 0.9736842105263158,
240
+ "eval_f1_class_9": 0.9680998613037449,
241
+ "eval_f1_macro": 0.9428559362828388,
242
+ "eval_f1_micro": 0.9431043523859465,
243
+ "eval_f1_weighted": 0.9427303581875908,
244
+ "eval_loss": 0.3067891001701355,
245
+ "eval_recall_weighted": 0.9431043523859465,
246
+ "eval_runtime": 18.589,
247
+ "eval_samples_per_second": 205.175,
248
+ "eval_steps_per_second": 6.455,
249
+ "step": 4000
250
+ },
251
+ {
252
+ "epoch": 4.716981132075472,
253
+ "grad_norm": 0.4483562707901001,
254
+ "learning_rate": 2.7903761061946903e-05,
255
+ "loss": 0.0732,
256
+ "step": 4500
257
+ },
258
+ {
259
+ "epoch": 4.716981132075472,
260
+ "eval_accuracy": 0.940744625065548,
261
+ "eval_f1_class_0": 0.8982826948480845,
262
+ "eval_f1_class_1": 0.9741496598639455,
263
+ "eval_f1_class_2": 0.8587570621468926,
264
+ "eval_f1_class_3": 0.940127388535032,
265
+ "eval_f1_class_4": 0.9477707006369427,
266
+ "eval_f1_class_5": 0.9290012033694345,
267
+ "eval_f1_class_6": 0.972972972972973,
268
+ "eval_f1_class_7": 0.9485294117647058,
269
+ "eval_f1_class_8": 0.9693741677762983,
270
+ "eval_f1_class_9": 0.9666666666666666,
271
+ "eval_f1_macro": 0.9405631928580975,
272
+ "eval_f1_micro": 0.940744625065548,
273
+ "eval_f1_weighted": 0.9405207858122142,
274
+ "eval_loss": 0.3208290636539459,
275
+ "eval_recall_weighted": 0.940744625065548,
276
+ "eval_runtime": 18.5774,
277
+ "eval_samples_per_second": 205.303,
278
+ "eval_steps_per_second": 6.459,
279
+ "step": 4500
280
+ },
281
+ {
282
+ "epoch": 5.241090146750524,
283
+ "grad_norm": 0.00789484940469265,
284
+ "learning_rate": 2.5138274336283185e-05,
285
+ "loss": 0.0582,
286
+ "step": 5000
287
+ },
288
+ {
289
+ "epoch": 5.241090146750524,
290
+ "eval_accuracy": 0.9444153120083901,
291
+ "eval_f1_class_0": 0.8981132075471698,
292
+ "eval_f1_class_1": 0.9780821917808219,
293
+ "eval_f1_class_2": 0.8748241912798874,
294
+ "eval_f1_class_3": 0.9479166666666665,
295
+ "eval_f1_class_4": 0.9436795994993743,
296
+ "eval_f1_class_5": 0.942189421894219,
297
+ "eval_f1_class_6": 0.9613259668508287,
298
+ "eval_f1_class_7": 0.9551122194513716,
299
+ "eval_f1_class_8": 0.9713541666666666,
300
+ "eval_f1_class_9": 0.9721448467966574,
301
+ "eval_f1_macro": 0.9444742478433662,
302
+ "eval_f1_micro": 0.9444153120083901,
303
+ "eval_f1_weighted": 0.9443976005879655,
304
+ "eval_loss": 0.3178301155567169,
305
+ "eval_recall_weighted": 0.9444153120083901,
306
+ "eval_runtime": 18.6189,
307
+ "eval_samples_per_second": 204.845,
308
+ "eval_steps_per_second": 6.445,
309
+ "step": 5000
310
+ },
311
+ {
312
+ "epoch": 5.765199161425577,
313
+ "grad_norm": 0.2585061490535736,
314
+ "learning_rate": 2.237278761061947e-05,
315
+ "loss": 0.0533,
316
+ "step": 5500
317
+ },
318
+ {
319
+ "epoch": 5.765199161425577,
320
+ "eval_accuracy": 0.951232302045097,
321
+ "eval_f1_class_0": 0.9144316730523627,
322
+ "eval_f1_class_1": 0.9754768392370572,
323
+ "eval_f1_class_2": 0.8732782369146006,
324
+ "eval_f1_class_3": 0.9662337662337663,
325
+ "eval_f1_class_4": 0.958974358974359,
326
+ "eval_f1_class_5": 0.9410288582183187,
327
+ "eval_f1_class_6": 0.9840848806366048,
328
+ "eval_f1_class_7": 0.9597989949748743,
329
+ "eval_f1_class_8": 0.9712793733681462,
330
+ "eval_f1_class_9": 0.9667590027700832,
331
+ "eval_f1_macro": 0.9511345984380173,
332
+ "eval_f1_micro": 0.9512323020450971,
333
+ "eval_f1_weighted": 0.9512610025073973,
334
+ "eval_loss": 0.30544909834861755,
335
+ "eval_recall_weighted": 0.951232302045097,
336
+ "eval_runtime": 18.5675,
337
+ "eval_samples_per_second": 205.412,
338
+ "eval_steps_per_second": 6.463,
339
+ "step": 5500
340
+ },
341
+ {
342
+ "epoch": 6.289308176100629,
343
+ "grad_norm": 0.020399658009409904,
344
+ "learning_rate": 1.9607300884955755e-05,
345
+ "loss": 0.0424,
346
+ "step": 6000
347
+ },
348
+ {
349
+ "epoch": 6.289308176100629,
350
+ "eval_accuracy": 0.9520188778185632,
351
+ "eval_f1_class_0": 0.9041450777202072,
352
+ "eval_f1_class_1": 0.9782016348773842,
353
+ "eval_f1_class_2": 0.8834019204389574,
354
+ "eval_f1_class_3": 0.9584415584415584,
355
+ "eval_f1_class_4": 0.9491094147582698,
356
+ "eval_f1_class_5": 0.9521472392638036,
357
+ "eval_f1_class_6": 0.9824086603518267,
358
+ "eval_f1_class_7": 0.9638854296388544,
359
+ "eval_f1_class_8": 0.9814323607427056,
360
+ "eval_f1_class_9": 0.9669421487603306,
361
+ "eval_f1_macro": 0.9520115444993896,
362
+ "eval_f1_micro": 0.9520188778185632,
363
+ "eval_f1_weighted": 0.9520110205225185,
364
+ "eval_loss": 0.30924680829048157,
365
+ "eval_recall_weighted": 0.9520188778185632,
366
+ "eval_runtime": 18.5678,
367
+ "eval_samples_per_second": 205.41,
368
+ "eval_steps_per_second": 6.463,
369
+ "step": 6000
370
+ },
371
+ {
372
+ "epoch": 6.813417190775681,
373
+ "grad_norm": 3.3116018772125244,
374
+ "learning_rate": 1.6841814159292034e-05,
375
+ "loss": 0.0378,
376
+ "step": 6500
377
+ },
378
+ {
379
+ "epoch": 6.813417190775681,
380
+ "eval_accuracy": 0.9530676455165181,
381
+ "eval_f1_class_0": 0.9155107187894074,
382
+ "eval_f1_class_1": 0.9768707482993196,
383
+ "eval_f1_class_2": 0.8838526912181304,
384
+ "eval_f1_class_3": 0.9488491048593349,
385
+ "eval_f1_class_4": 0.951885565669701,
386
+ "eval_f1_class_5": 0.9431680773881501,
387
+ "eval_f1_class_6": 0.9813829787234043,
388
+ "eval_f1_class_7": 0.9669211195928754,
389
+ "eval_f1_class_8": 0.9868073878627968,
390
+ "eval_f1_class_9": 0.9750000000000001,
391
+ "eval_f1_macro": 0.953024839240312,
392
+ "eval_f1_micro": 0.9530676455165181,
393
+ "eval_f1_weighted": 0.952958033459228,
394
+ "eval_loss": 0.30203908681869507,
395
+ "eval_recall_weighted": 0.9530676455165181,
396
+ "eval_runtime": 18.5121,
397
+ "eval_samples_per_second": 206.028,
398
+ "eval_steps_per_second": 6.482,
399
+ "step": 6500
400
+ },
401
+ {
402
+ "epoch": 7.337526205450734,
403
+ "grad_norm": 0.005160727072507143,
404
+ "learning_rate": 1.4081858407079645e-05,
405
+ "loss": 0.0305,
406
+ "step": 7000
407
+ },
408
+ {
409
+ "epoch": 7.337526205450734,
410
+ "eval_accuracy": 0.954116413214473,
411
+ "eval_f1_class_0": 0.9171817058096415,
412
+ "eval_f1_class_1": 0.9822646657571624,
413
+ "eval_f1_class_2": 0.8885672937771346,
414
+ "eval_f1_class_3": 0.95822454308094,
415
+ "eval_f1_class_4": 0.952020202020202,
416
+ "eval_f1_class_5": 0.9428918590522478,
417
+ "eval_f1_class_6": 0.9761273209549071,
418
+ "eval_f1_class_7": 0.9647355163727961,
419
+ "eval_f1_class_8": 0.9866310160427807,
420
+ "eval_f1_class_9": 0.9721448467966574,
421
+ "eval_f1_macro": 0.9540788969664471,
422
+ "eval_f1_micro": 0.954116413214473,
423
+ "eval_f1_weighted": 0.9540130488123523,
424
+ "eval_loss": 0.28602519631385803,
425
+ "eval_recall_weighted": 0.954116413214473,
426
+ "eval_runtime": 18.523,
427
+ "eval_samples_per_second": 205.906,
428
+ "eval_steps_per_second": 6.478,
429
+ "step": 7000
430
+ },
431
+ {
432
+ "epoch": 7.861635220125786,
433
+ "grad_norm": 0.02748439833521843,
434
+ "learning_rate": 1.131637168141593e-05,
435
+ "loss": 0.0216,
436
+ "step": 7500
437
+ },
438
+ {
439
+ "epoch": 7.861635220125786,
440
+ "eval_accuracy": 0.9562139486103828,
441
+ "eval_f1_class_0": 0.9250317662007626,
442
+ "eval_f1_class_1": 0.9794801641586868,
443
+ "eval_f1_class_2": 0.8888888888888887,
444
+ "eval_f1_class_3": 0.9527458492975734,
445
+ "eval_f1_class_4": 0.9604086845466155,
446
+ "eval_f1_class_5": 0.9476248477466506,
447
+ "eval_f1_class_6": 0.9853137516688919,
448
+ "eval_f1_class_7": 0.9682337992376113,
449
+ "eval_f1_class_8": 0.9790575916230366,
450
+ "eval_f1_class_9": 0.9736477115117891,
451
+ "eval_f1_macro": 0.9560433054880507,
452
+ "eval_f1_micro": 0.9562139486103828,
453
+ "eval_f1_weighted": 0.956078385428201,
454
+ "eval_loss": 0.303732693195343,
455
+ "eval_recall_weighted": 0.9562139486103828,
456
+ "eval_runtime": 18.6679,
457
+ "eval_samples_per_second": 204.308,
458
+ "eval_steps_per_second": 6.428,
459
+ "step": 7500
460
+ },
461
+ {
462
+ "epoch": 8.385744234800839,
463
+ "grad_norm": 0.017909903079271317,
464
+ "learning_rate": 8.550884955752212e-06,
465
+ "loss": 0.0198,
466
+ "step": 8000
467
+ },
468
+ {
469
+ "epoch": 8.385744234800839,
470
+ "eval_accuracy": 0.9562139486103828,
471
+ "eval_f1_class_0": 0.9207547169811321,
472
+ "eval_f1_class_1": 0.9795361527967258,
473
+ "eval_f1_class_2": 0.8829337094499294,
474
+ "eval_f1_class_3": 0.9623865110246432,
475
+ "eval_f1_class_4": 0.9592875318066159,
476
+ "eval_f1_class_5": 0.9451887941534713,
477
+ "eval_f1_class_6": 0.9826897470039946,
478
+ "eval_f1_class_7": 0.9707006369426752,
479
+ "eval_f1_class_8": 0.9855072463768115,
480
+ "eval_f1_class_9": 0.9721448467966574,
481
+ "eval_f1_macro": 0.9561129893332657,
482
+ "eval_f1_micro": 0.9562139486103828,
483
+ "eval_f1_weighted": 0.9561806063383737,
484
+ "eval_loss": 0.2868480682373047,
485
+ "eval_recall_weighted": 0.9562139486103828,
486
+ "eval_runtime": 18.5749,
487
+ "eval_samples_per_second": 205.331,
488
+ "eval_steps_per_second": 6.46,
489
+ "step": 8000
490
+ },
491
+ {
492
+ "epoch": 8.90985324947589,
493
+ "grad_norm": 0.019022395834326744,
494
+ "learning_rate": 5.785398230088496e-06,
495
+ "loss": 0.0145,
496
+ "step": 8500
497
+ },
498
+ {
499
+ "epoch": 8.90985324947589,
500
+ "eval_accuracy": 0.9580492920818039,
501
+ "eval_f1_class_0": 0.9221260815822002,
502
+ "eval_f1_class_1": 0.9754768392370572,
503
+ "eval_f1_class_2": 0.8932584269662921,
504
+ "eval_f1_class_3": 0.9621903520208606,
505
+ "eval_f1_class_4": 0.9541984732824428,
506
+ "eval_f1_class_5": 0.957920792079208,
507
+ "eval_f1_class_6": 0.9879518072289156,
508
+ "eval_f1_class_7": 0.9671717171717172,
509
+ "eval_f1_class_8": 0.9867724867724869,
510
+ "eval_f1_class_9": 0.9735006973500697,
511
+ "eval_f1_macro": 0.9580567673691249,
512
+ "eval_f1_micro": 0.9580492920818039,
513
+ "eval_f1_weighted": 0.9581079328217166,
514
+ "eval_loss": 0.29160091280937195,
515
+ "eval_recall_weighted": 0.9580492920818039,
516
+ "eval_runtime": 18.5713,
517
+ "eval_samples_per_second": 205.371,
518
+ "eval_steps_per_second": 6.462,
519
+ "step": 8500
520
+ },
521
+ {
522
+ "epoch": 9.433962264150944,
523
+ "grad_norm": 0.0038110397290438414,
524
+ "learning_rate": 3.019911504424779e-06,
525
+ "loss": 0.014,
526
+ "step": 9000
527
+ },
528
+ {
529
+ "epoch": 9.433962264150944,
530
+ "eval_accuracy": 0.959884635553225,
531
+ "eval_f1_class_0": 0.9228855721393034,
532
+ "eval_f1_class_1": 0.9809264305177112,
533
+ "eval_f1_class_2": 0.9080459770114943,
534
+ "eval_f1_class_3": 0.9571984435797665,
535
+ "eval_f1_class_4": 0.9566326530612245,
536
+ "eval_f1_class_5": 0.9585365853658537,
537
+ "eval_f1_class_6": 0.9840848806366048,
538
+ "eval_f1_class_7": 0.9745547073791349,
539
+ "eval_f1_class_8": 0.9868073878627968,
540
+ "eval_f1_class_9": 0.9680998613037449,
541
+ "eval_f1_macro": 0.9597772498857635,
542
+ "eval_f1_micro": 0.959884635553225,
543
+ "eval_f1_weighted": 0.9597604965241316,
544
+ "eval_loss": 0.28916341066360474,
545
+ "eval_recall_weighted": 0.959884635553225,
546
+ "eval_runtime": 18.5578,
547
+ "eval_samples_per_second": 205.52,
548
+ "eval_steps_per_second": 6.466,
549
+ "step": 9000
550
  }
551
  ],
552
  "logging_steps": 500,
553
+ "max_steps": 9540,
554
  "num_input_tokens_seen": 0,
555
  "num_train_epochs": 10,
556
  "save_steps": 500,
 
566
  "attributes": {}
567
  }
568
  },
569
+ "total_flos": 3.787923857566925e+16,
570
  "train_batch_size": 16,
571
  "trial_name": null,
572
  "trial_params": null