Kendamarron committed (verified)
Commit 817bdc0 · 1 Parent(s): ebb77eb

Upload folder using huggingface_hub
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a53bcbc68dec82e2ce8cf2d2d439a8dbf38126c02cc88a567925a6781f6a782f
- size 167832240
+ oid sha256:e39308711dbd127efe41850085dc7f2f63d54a44aad2dd2a502891474924b787
+ size 83946192
checkpoint-100/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a53bcbc68dec82e2ce8cf2d2d439a8dbf38126c02cc88a567925a6781f6a782f
- size 167832240
+ oid sha256:e39308711dbd127efe41850085dc7f2f63d54a44aad2dd2a502891474924b787
+ size 83946192
checkpoint-100/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9c62536abee8b78adb6147453bd71c877d7d2b4d5a8f8d034d44a921275e11ab
+ oid sha256:dacc62b0f0a2216d58e92701202dd5456a079db701e9e321345271c623dd7c6a
  size 335810482
checkpoint-100/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:808521b8055ae579535313bc1b5f324216971386eaeef0a87693a238c17a92b3
+ oid sha256:2975b104fc6f24da571473b9b64f8d64d931a14d44726583f5951b3fe5be12b9
  size 14168
checkpoint-100/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:76b4fa9c11af5ffcd754861813fd859eeac70d14b38222bd674da5b3ddfefcfe
+ oid sha256:dddbc5bcee87f33f86ec113accc5c003fca74582bc423aa05d433c41c6d3cf4e
  size 1056
checkpoint-100/trainer_state.json CHANGED
@@ -10,142 +10,142 @@
  "log_history": [
  {
  "epoch": 0.2,
- "grad_norm": 5.041278839111328,
+ "grad_norm": 5.09375,
  "learning_rate": 2.9968542393565676e-06,
- "loss": 1.9371,
+ "loss": 1.9388,
  "step": 5
  },
  {
  "epoch": 0.41,
- "grad_norm": NaN,
- "learning_rate": 2.9717672653473587e-06,
- "loss": 1.806,
+ "grad_norm": 4.4375,
+ "learning_rate": 2.9616157869703894e-06,
+ "loss": 1.8455,
  "step": 10
  },
  {
  "epoch": 0.61,
- "grad_norm": 3.5015058517456055,
- "learning_rate": 2.905810057509516e-06,
- "loss": 1.7373,
+ "grad_norm": 3.59375,
+ "learning_rate": 2.8881318444640566e-06,
+ "loss": 1.7796,
  "step": 15
  },
  {
  "epoch": 0.82,
- "grad_norm": 3.583218574523926,
- "learning_rate": 2.803067604777227e-06,
- "loss": 1.7471,
+ "grad_norm": 3.796875,
+ "learning_rate": 2.778325235483954e-06,
+ "loss": 1.8091,
  "step": 20
  },
  {
  "epoch": 1.02,
- "grad_norm": 4.409696102142334,
- "learning_rate": 2.666228326019474e-06,
- "loss": 1.6622,
+ "grad_norm": 4.625,
+ "learning_rate": 2.6350692237265428e-06,
+ "loss": 1.7224,
  "step": 25
  },
  {
  "epoch": 1.22,
- "grad_norm": 3.5655429363250732,
- "learning_rate": 2.498872837517522e-06,
- "loss": 1.6483,
+ "grad_norm": 3.953125,
+ "learning_rate": 2.4621123294467098e-06,
+ "loss": 1.7108,
  "step": 30
  },
  {
  "epoch": 1.43,
- "grad_norm": 4.161169528961182,
- "learning_rate": 2.305380260473476e-06,
- "loss": 1.674,
+ "grad_norm": 4.125,
+ "learning_rate": 2.2639802434931445e-06,
+ "loss": 1.7299,
  "step": 35
  },
  {
  "epoch": 1.63,
- "grad_norm": 3.5801784992218018,
- "learning_rate": 2.090813634373931e-06,
- "loss": 1.6635,
+ "grad_norm": 3.625,
+ "learning_rate": 2.0458574054452316e-06,
+ "loss": 1.7111,
  "step": 40
  },
  {
  "epoch": 1.84,
- "grad_norm": 3.7053236961364746,
- "learning_rate": 1.8607874345493807e-06,
- "loss": 1.6857,
+ "grad_norm": 3.53125,
+ "learning_rate": 1.813451344546913e-06,
+ "loss": 1.7364,
  "step": 45
  },
  {
  "epoch": 2.04,
- "grad_norm": 3.145810127258301,
- "learning_rate": 1.6213206605421064e-06,
- "loss": 1.6165,
+ "grad_norm": 3.1875,
+ "learning_rate": 1.5728433331716726e-06,
+ "loss": 1.6664,
  "step": 50
  },
  {
  "epoch": 2.24,
- "grad_norm": 3.514664649963379,
- "learning_rate": 1.3786793394578939e-06,
- "loss": 1.6025,
+ "grad_norm": 3.125,
+ "learning_rate": 1.3303292607070737e-06,
+ "loss": 1.6673,
  "step": 55
  },
  {
  "epoch": 2.45,
- "grad_norm": 3.9232802391052246,
- "learning_rate": 1.13921256545062e-06,
- "loss": 1.5615,
+ "grad_norm": 3.65625,
+ "learning_rate": 1.0922548916454855e-06,
+ "loss": 1.6219,
  "step": 60
  },
  {
  "epoch": 2.65,
- "grad_norm": 3.0508198738098145,
- "learning_rate": 9.091863656260696e-07,
- "loss": 1.6076,
+ "grad_norm": 2.8125,
+ "learning_rate": 8.648498186137653e-07,
+ "loss": 1.6648,
  "step": 65
  },
  {
  "epoch": 2.86,
- "grad_norm": 3.7754147052764893,
- "learning_rate": 6.946197395265243e-07,
- "loss": 1.664,
+ "grad_norm": 3.53125,
+ "learning_rate": 6.540644552236401e-07,
+ "loss": 1.699,
  "step": 70
  },
  {
  "epoch": 3.06,
- "grad_norm": 3.5631840229034424,
- "learning_rate": 5.011271624824787e-07,
- "loss": 1.5976,
+ "grad_norm": 4.15625,
+ "learning_rate": 4.6541433408284356e-07,
+ "loss": 1.6821,
  "step": 75
  },
  {
  "epoch": 3.27,
- "grad_norm": 3.4716637134552,
- "learning_rate": 3.337716739805264e-07,
- "loss": 1.5761,
+ "grad_norm": 3.546875,
+ "learning_rate": 3.0383578415591913e-07,
+ "loss": 1.6633,
  "step": 80
  },
  {
  "epoch": 3.47,
- "grad_norm": 3.624776601791382,
- "learning_rate": 1.9693239522277327e-07,
- "loss": 1.5924,
+ "grad_norm": 3.4375,
+ "learning_rate": 1.7355676390496482e-07,
+ "loss": 1.6522,
  "step": 85
  },
  {
  "epoch": 3.67,
- "grad_norm": 3.4213175773620605,
- "learning_rate": 9.418994249048474e-08,
- "loss": 1.6305,
+ "grad_norm": 3.328125,
+ "learning_rate": 7.798623006559436e-08,
+ "loss": 1.6788,
  "step": 90
  },
  {
  "epoch": 3.88,
- "grad_norm": 3.8403515815734863,
- "learning_rate": 2.8232734652641424e-08,
- "loss": 1.6219,
+ "grad_norm": 3.828125,
+ "learning_rate": 1.962493689916395e-08,
+ "loss": 1.6806,
  "step": 95
  },
  {
  "epoch": 4.08,
- "grad_norm": 3.393942356109619,
- "learning_rate": 7.866464317276001e-10,
- "loss": 1.5561,
+ "grad_norm": 3.25,
+ "learning_rate": 0.0,
+ "loss": 1.6213,
  "step": 100
  }
  ],
@@ -154,8 +154,8 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
- "total_flos": 8673113877921792.0,
- "train_batch_size": 8,
+ "total_flos": 8016985030459392.0,
+ "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
checkpoint-100/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dcd55fa32c983512f289bcc506b75cd6687379a244a95f246ddb3cda8a97ea11
+ oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
  size 4960
checkpoint-20/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:81169049628740146f07ac7e1f2c59f6475467aa39b48ea79092281b1cf4f31f
- size 167832240
+ oid sha256:36fe3540199ef008a5a14901c68ed9f51bd9b4d479e2c02c0a2a74bf9ab9c08d
+ size 83946192
checkpoint-20/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1b5c5a5f5f27d9ba6aebc06e9fb489f05042a6881eeeddbd130172ad2723e6c5
+ oid sha256:ba79b0978d0a6742535ddd213b0cf9c7209903a3c55809669cb1c9d19ccf9abc
  size 335810482
checkpoint-20/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c38429496778665cedc2e268e56dc0476144498310916d1f0cfff08c093b6b5c
+ oid sha256:596bb0c43f17f8cd0971123a502b06f192d7a434146d9d5e3e84fb081424cc46
  size 14168
checkpoint-20/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ce642ef0996d1e3d6618dd62493c8bbb08bd03811c25a654671f6952e68b2cd2
+ oid sha256:2256134ffb225a6c790a5cebe9b44be1002bae7b80db44f96adfc6030072a13c
  size 1056
checkpoint-20/trainer_state.json CHANGED
@@ -10,30 +10,30 @@
  "log_history": [
  {
  "epoch": 0.2,
- "grad_norm": 5.041278839111328,
+ "grad_norm": 5.09375,
  "learning_rate": 2.9968542393565676e-06,
- "loss": 1.9371,
+ "loss": 1.9388,
  "step": 5
  },
  {
  "epoch": 0.41,
- "grad_norm": NaN,
- "learning_rate": 2.9717672653473587e-06,
- "loss": 1.806,
+ "grad_norm": 4.4375,
+ "learning_rate": 2.9616157869703894e-06,
+ "loss": 1.8455,
  "step": 10
  },
  {
  "epoch": 0.61,
- "grad_norm": 3.5015058517456055,
- "learning_rate": 2.905810057509516e-06,
- "loss": 1.7373,
+ "grad_norm": 3.59375,
+ "learning_rate": 2.8881318444640566e-06,
+ "loss": 1.7796,
  "step": 15
  },
  {
  "epoch": 0.82,
- "grad_norm": 3.583218574523926,
- "learning_rate": 2.803067604777227e-06,
- "loss": 1.7471,
+ "grad_norm": 3.796875,
+ "learning_rate": 2.778325235483954e-06,
+ "loss": 1.8091,
  "step": 20
  }
  ],
@@ -42,8 +42,8 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
- "total_flos": 1729235156533248.0,
- "train_batch_size": 8,
+ "total_flos": 1604332476923904.0,
+ "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
checkpoint-20/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dcd55fa32c983512f289bcc506b75cd6687379a244a95f246ddb3cda8a97ea11
+ oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
  size 4960
checkpoint-40/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:201e57d1be1961bebbcd1ff8eda8c6677e4a69421937df43b308c971c3108aea
- size 167832240
+ oid sha256:ab1d62447cec6baff4b897987488e96f3e155ae7a2d789c9bfd44df3352413bd
+ size 83946192
checkpoint-40/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:079e8a89a0226983e6e8ae22966abf6aed4e5ebf6d41cb8dab776e531a5362f7
+ oid sha256:bb9a12580e8a534dab4a9273a9183c0f3caabe7de1e89240f077919754dc8398
  size 335810482
checkpoint-40/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e0230bfed5d2d4ba948095836587d23305d972aac689f5cdcf2fefea079cce46
+ oid sha256:076780903adeff97a16d6f25588ee658c45903a68edf79adba5d800f18428061
  size 14168
checkpoint-40/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:701d4e651a2909e8f790218a3f92bf4c4a7e6e82039f18b093a26a63dfa65d6a
+ oid sha256:aaf7ad3bbb37486a5e7658ddf05bfee6df67222659155f89c9b8f68d5f791717
  size 1056
checkpoint-40/trainer_state.json CHANGED
@@ -10,58 +10,58 @@
  "log_history": [
  {
  "epoch": 0.2,
- "grad_norm": 5.041278839111328,
+ "grad_norm": 5.09375,
  "learning_rate": 2.9968542393565676e-06,
- "loss": 1.9371,
+ "loss": 1.9388,
  "step": 5
  },
  {
  "epoch": 0.41,
- "grad_norm": NaN,
- "learning_rate": 2.9717672653473587e-06,
- "loss": 1.806,
+ "grad_norm": 4.4375,
+ "learning_rate": 2.9616157869703894e-06,
+ "loss": 1.8455,
  "step": 10
  },
  {
  "epoch": 0.61,
- "grad_norm": 3.5015058517456055,
- "learning_rate": 2.905810057509516e-06,
- "loss": 1.7373,
+ "grad_norm": 3.59375,
+ "learning_rate": 2.8881318444640566e-06,
+ "loss": 1.7796,
  "step": 15
  },
  {
  "epoch": 0.82,
- "grad_norm": 3.583218574523926,
- "learning_rate": 2.803067604777227e-06,
- "loss": 1.7471,
+ "grad_norm": 3.796875,
+ "learning_rate": 2.778325235483954e-06,
+ "loss": 1.8091,
  "step": 20
  },
  {
  "epoch": 1.02,
- "grad_norm": 4.409696102142334,
- "learning_rate": 2.666228326019474e-06,
- "loss": 1.6622,
+ "grad_norm": 4.625,
+ "learning_rate": 2.6350692237265428e-06,
+ "loss": 1.7224,
  "step": 25
  },
  {
  "epoch": 1.22,
- "grad_norm": 3.5655429363250732,
- "learning_rate": 2.498872837517522e-06,
- "loss": 1.6483,
+ "grad_norm": 3.953125,
+ "learning_rate": 2.4621123294467098e-06,
+ "loss": 1.7108,
  "step": 30
  },
  {
  "epoch": 1.43,
- "grad_norm": 4.161169528961182,
- "learning_rate": 2.305380260473476e-06,
- "loss": 1.674,
+ "grad_norm": 4.125,
+ "learning_rate": 2.2639802434931445e-06,
+ "loss": 1.7299,
  "step": 35
  },
  {
  "epoch": 1.63,
- "grad_norm": 3.5801784992218018,
- "learning_rate": 2.090813634373931e-06,
- "loss": 1.6635,
+ "grad_norm": 3.625,
+ "learning_rate": 2.0458574054452316e-06,
+ "loss": 1.7111,
  "step": 40
  }
  ],
@@ -70,8 +70,8 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
- "total_flos": 3471636198850560.0,
- "train_batch_size": 8,
+ "total_flos": 3199050392518656.0,
+ "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
checkpoint-40/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dcd55fa32c983512f289bcc506b75cd6687379a244a95f246ddb3cda8a97ea11
+ oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
  size 4960
checkpoint-60/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ffdddd99b7e2dac2fbe6f1527394f3bd35c8f4cec7f824d6a680b150eab29752
- size 167832240
+ oid sha256:d0b479837a2f06b820e0f0f448720fe1bd18ae1237586386c794257aec6a6aef
+ size 83946192
checkpoint-60/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a2ca205f6859d421a936cd43599ebb3a9e6b4746a4617aec3eb1e4451973a6ab
+ oid sha256:bc5a078967486289c7bb2846375d3eb6741292205506fe5dde1c1a54fd203bf7
  size 335810482
checkpoint-60/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e6a26fff3a3d4ce3ae0cf2b12423f9b87839511db578ab2145a0af9abb15ae98
+ oid sha256:6fe60fa4f0a2e1e83d88e7fff8399bc59bb2f8ab03ac19802ae736a7dbddb571
  size 14168
checkpoint-60/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:66ff6bbcef17af5c104327b1fe8909f5fc08bf691eb358ca5fd99210a287b128
+ oid sha256:d151571d18043fd71e8a555e30e88db47c2b573382d14ae55c5df3ad38604822
  size 1056
checkpoint-60/trainer_state.json CHANGED
@@ -10,86 +10,86 @@
  "log_history": [
  {
  "epoch": 0.2,
- "grad_norm": 5.041278839111328,
+ "grad_norm": 5.09375,
  "learning_rate": 2.9968542393565676e-06,
- "loss": 1.9371,
+ "loss": 1.9388,
  "step": 5
  },
  {
  "epoch": 0.41,
- "grad_norm": NaN,
- "learning_rate": 2.9717672653473587e-06,
- "loss": 1.806,
+ "grad_norm": 4.4375,
+ "learning_rate": 2.9616157869703894e-06,
+ "loss": 1.8455,
  "step": 10
  },
  {
  "epoch": 0.61,
- "grad_norm": 3.5015058517456055,
- "learning_rate": 2.905810057509516e-06,
- "loss": 1.7373,
+ "grad_norm": 3.59375,
+ "learning_rate": 2.8881318444640566e-06,
+ "loss": 1.7796,
  "step": 15
  },
  {
  "epoch": 0.82,
- "grad_norm": 3.583218574523926,
- "learning_rate": 2.803067604777227e-06,
- "loss": 1.7471,
+ "grad_norm": 3.796875,
+ "learning_rate": 2.778325235483954e-06,
+ "loss": 1.8091,
  "step": 20
  },
  {
  "epoch": 1.02,
- "grad_norm": 4.409696102142334,
- "learning_rate": 2.666228326019474e-06,
- "loss": 1.6622,
+ "grad_norm": 4.625,
+ "learning_rate": 2.6350692237265428e-06,
+ "loss": 1.7224,
  "step": 25
  },
  {
  "epoch": 1.22,
- "grad_norm": 3.5655429363250732,
- "learning_rate": 2.498872837517522e-06,
- "loss": 1.6483,
+ "grad_norm": 3.953125,
+ "learning_rate": 2.4621123294467098e-06,
+ "loss": 1.7108,
  "step": 30
  },
  {
  "epoch": 1.43,
- "grad_norm": 4.161169528961182,
- "learning_rate": 2.305380260473476e-06,
- "loss": 1.674,
+ "grad_norm": 4.125,
+ "learning_rate": 2.2639802434931445e-06,
+ "loss": 1.7299,
  "step": 35
  },
  {
  "epoch": 1.63,
- "grad_norm": 3.5801784992218018,
- "learning_rate": 2.090813634373931e-06,
- "loss": 1.6635,
+ "grad_norm": 3.625,
+ "learning_rate": 2.0458574054452316e-06,
+ "loss": 1.7111,
  "step": 40
  },
  {
  "epoch": 1.84,
- "grad_norm": 3.7053236961364746,
- "learning_rate": 1.8607874345493807e-06,
- "loss": 1.6857,
+ "grad_norm": 3.53125,
+ "learning_rate": 1.813451344546913e-06,
+ "loss": 1.7364,
  "step": 45
  },
  {
  "epoch": 2.04,
- "grad_norm": 3.145810127258301,
- "learning_rate": 1.6213206605421064e-06,
- "loss": 1.6165,
+ "grad_norm": 3.1875,
+ "learning_rate": 1.5728433331716726e-06,
+ "loss": 1.6664,
  "step": 50
  },
  {
  "epoch": 2.24,
- "grad_norm": 3.514664649963379,
- "learning_rate": 1.3786793394578939e-06,
- "loss": 1.6025,
+ "grad_norm": 3.125,
+ "learning_rate": 1.3303292607070737e-06,
+ "loss": 1.6673,
  "step": 55
  },
  {
  "epoch": 2.45,
- "grad_norm": 3.9232802391052246,
- "learning_rate": 1.13921256545062e-06,
- "loss": 1.5615,
+ "grad_norm": 3.65625,
+ "learning_rate": 1.0922548916454855e-06,
+ "loss": 1.6219,
  "step": 60
  }
  ],
@@ -98,8 +98,8 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
- "total_flos": 5264188871884800.0,
- "train_batch_size": 8,
+ "total_flos": 4847990969303040.0,
+ "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
checkpoint-60/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dcd55fa32c983512f289bcc506b75cd6687379a244a95f246ddb3cda8a97ea11
+ oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
  size 4960
checkpoint-80/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c4be7974723249f007017e295554c31f7a342db52a8eba5f7a93c2a7a155e020
- size 167832240
+ oid sha256:d2072d0773bb7a3b3ac9c1b62ccf0be9fb811367ea38d87012e0fe0cd0fa4c99
+ size 83946192
checkpoint-80/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3bc3157b40e30f2fda6a7e7023a977eaccfd60bedd3515639ec519017b2ed43f
+ oid sha256:6a3436433c51ee5552f4562b6706190e1b0b15ef3f5a6d76cc552fa0d29e0c6f
  size 335810482
checkpoint-80/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:abd6efaf95891e0f58baa8477c4de71a0b71c10ed67610f5cabdc07e838de207
+ oid sha256:39b06804188097f3c1bab9da775567b42104978432eb2ab2e415e5e56cb71c34
  size 14168
checkpoint-80/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bee8a0b8f96a0f78af76b5c0179274ad985c4e5aff73d9a192518417414c17d8
+ oid sha256:f9a8053fc960722f2783f69c7ab9c51a9ad21ea164a693d17b94cded06192073
  size 1056
checkpoint-80/trainer_state.json CHANGED
@@ -10,114 +10,114 @@
  "log_history": [
  {
  "epoch": 0.2,
- "grad_norm": 5.041278839111328,
+ "grad_norm": 5.09375,
  "learning_rate": 2.9968542393565676e-06,
- "loss": 1.9371,
+ "loss": 1.9388,
  "step": 5
  },
  {
  "epoch": 0.41,
- "grad_norm": NaN,
- "learning_rate": 2.9717672653473587e-06,
- "loss": 1.806,
+ "grad_norm": 4.4375,
+ "learning_rate": 2.9616157869703894e-06,
+ "loss": 1.8455,
  "step": 10
  },
  {
  "epoch": 0.61,
- "grad_norm": 3.5015058517456055,
- "learning_rate": 2.905810057509516e-06,
- "loss": 1.7373,
+ "grad_norm": 3.59375,
+ "learning_rate": 2.8881318444640566e-06,
+ "loss": 1.7796,
  "step": 15
  },
  {
  "epoch": 0.82,
- "grad_norm": 3.583218574523926,
- "learning_rate": 2.803067604777227e-06,
- "loss": 1.7471,
+ "grad_norm": 3.796875,
+ "learning_rate": 2.778325235483954e-06,
+ "loss": 1.8091,
  "step": 20
  },
  {
  "epoch": 1.02,
- "grad_norm": 4.409696102142334,
- "learning_rate": 2.666228326019474e-06,
- "loss": 1.6622,
+ "grad_norm": 4.625,
+ "learning_rate": 2.6350692237265428e-06,
+ "loss": 1.7224,
  "step": 25
  },
  {
  "epoch": 1.22,
- "grad_norm": 3.5655429363250732,
- "learning_rate": 2.498872837517522e-06,
- "loss": 1.6483,
+ "grad_norm": 3.953125,
+ "learning_rate": 2.4621123294467098e-06,
+ "loss": 1.7108,
  "step": 30
  },
  {
  "epoch": 1.43,
- "grad_norm": 4.161169528961182,
- "learning_rate": 2.305380260473476e-06,
- "loss": 1.674,
+ "grad_norm": 4.125,
+ "learning_rate": 2.2639802434931445e-06,
+ "loss": 1.7299,
  "step": 35
  },
  {
  "epoch": 1.63,
- "grad_norm": 3.5801784992218018,
- "learning_rate": 2.090813634373931e-06,
- "loss": 1.6635,
+ "grad_norm": 3.625,
+ "learning_rate": 2.0458574054452316e-06,
+ "loss": 1.7111,
  "step": 40
  },
  {
  "epoch": 1.84,
- "grad_norm": 3.7053236961364746,
- "learning_rate": 1.8607874345493807e-06,
- "loss": 1.6857,
+ "grad_norm": 3.53125,
+ "learning_rate": 1.813451344546913e-06,
+ "loss": 1.7364,
  "step": 45
  },
  {
  "epoch": 2.04,
- "grad_norm": 3.145810127258301,
- "learning_rate": 1.6213206605421064e-06,
- "loss": 1.6165,
+ "grad_norm": 3.1875,
+ "learning_rate": 1.5728433331716726e-06,
+ "loss": 1.6664,
  "step": 50
  },
  {
  "epoch": 2.24,
- "grad_norm": 3.514664649963379,
- "learning_rate": 1.3786793394578939e-06,
- "loss": 1.6025,
+ "grad_norm": 3.125,
+ "learning_rate": 1.3303292607070737e-06,
+ "loss": 1.6673,
  "step": 55
  },
  {
  "epoch": 2.45,
- "grad_norm": 3.9232802391052246,
- "learning_rate": 1.13921256545062e-06,
- "loss": 1.5615,
+ "grad_norm": 3.65625,
+ "learning_rate": 1.0922548916454855e-06,
+ "loss": 1.6219,
  "step": 60
  },
  {
  "epoch": 2.65,
- "grad_norm": 3.0508198738098145,
- "learning_rate": 9.091863656260696e-07,
- "loss": 1.6076,
+ "grad_norm": 2.8125,
+ "learning_rate": 8.648498186137653e-07,
+ "loss": 1.6648,
  "step": 65
  },
  {
  "epoch": 2.86,
- "grad_norm": 3.7754147052764893,
- "learning_rate": 6.946197395265243e-07,
- "loss": 1.664,
+ "grad_norm": 3.53125,
+ "learning_rate": 6.540644552236401e-07,
+ "loss": 1.699,
  "step": 70
  },
  {
  "epoch": 3.06,
- "grad_norm": 3.5631840229034424,
- "learning_rate": 5.011271624824787e-07,
- "loss": 1.5976,
+ "grad_norm": 4.15625,
+ "learning_rate": 4.6541433408284356e-07,
+ "loss": 1.6821,
  "step": 75
  },
  {
  "epoch": 3.27,
- "grad_norm": 3.4716637134552,
- "learning_rate": 3.337716739805264e-07,
- "loss": 1.5761,
+ "grad_norm": 3.546875,
+ "learning_rate": 3.0383578415591913e-07,
+ "loss": 1.6633,
  "step": 80
  }
  ],
@@ -126,8 +126,8 @@
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
- "total_flos": 6971682993340416.0,
- "train_batch_size": 8,
+ "total_flos": 6434740059291648.0,
+ "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
checkpoint-80/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dcd55fa32c983512f289bcc506b75cd6687379a244a95f246ddb3cda8a97ea11
+ oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
  size 4960
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dcd55fa32c983512f289bcc506b75cd6687379a244a95f246ddb3cda8a97ea11
+ oid sha256:23cfefdd62756fac4437632539fdfbb741029e6fb943cafeffe397c21a403a5d
  size 4960
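Each binary file above is tracked as a Git LFS pointer rather than the blob itself: three lines giving the spec version, the SHA-256 of the real file, and its byte size. A minimal sketch, assuming the pointer text and the downloaded blob are both available locally (the file paths below are hypothetical), of how one might confirm a downloaded artifact matches the pointer recorded in this commit:

```python
import hashlib
import os

def read_lfs_pointer(pointer_path):
    """Parse a Git LFS pointer file into its oid (hex digest) and size fields."""
    fields = {}
    with open(pointer_path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    oid = fields["oid"].split(":", 1)[1]  # "sha256:<hex>" -> "<hex>"
    size = int(fields["size"])
    return oid, size

def verify_blob(pointer_path, blob_path):
    """Check a downloaded blob against the oid/size recorded in its LFS pointer."""
    expected_oid, expected_size = read_lfs_pointer(pointer_path)
    if os.path.getsize(blob_path) != expected_size:
        return False
    sha = hashlib.sha256()
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    return sha.hexdigest() == expected_oid

# Hypothetical paths; point these at the pointer text and the downloaded weights.
print(verify_blob("adapter_model.safetensors.pointer", "adapter_model.safetensors"))
```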