Update README.md
README.md
CHANGED
@@ -56,11 +56,6 @@ Related models: [Cerebras-GPT Models](https://huggingface.co/models?sort=downloa
 | Cerebras-GPT | 2.7B | 32 | 2560 | 20 | 128 | 10240 | 2.00E-04 | 528 | 1.08M |
 | Cerebras-GPT | 6.7B | 32 | 4096 | 32 | 128 | 16384 | 1.20E-04 | 1040 | 2.13M |
 | Cerebras-GPT | 13B | 40 | 5120 | 40 | 128 | 20480 | 1.20E-04 | 720/1080 | 1.47M/2.21M |
-| Cerebras-GPT-muP | 111M | 10 | 768 | 12 | 64 | 3072 | 6.00E-03 | 120 | 246K |
-| Cerebras-GPT-muP | 256M | 14 | 1088 | 17 | 64 | 4352 | 6.00E-03 | 264 | 541K |
-| Cerebras-GPT-muP | 590M | 18 | 1536 | 12 | 128 | 6144 | 6.00E-03 | 264 | 541K |
-| Cerebras-GPT-muP | 1.3B | 24 | 2048 | 16 | 128 | 8192 | 6.00E-03 | 528 | 1.08M |
-| Cerebras-GPT-muP | 2.7B | 32 | 2560 | 20 | 128 | 10240 | 6.00E-03 | 528 | 1.08M |

 <br><br>
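The last two columns of the table above are linked by the training sequence length: batch size in sequences times tokens per sequence gives the batch size in tokens. A minimal sanity check, assuming the 2,048-token context length Cerebras-GPT trains with (an assumption here, not shown in this table), and treating the 13B entry's 720/1080 as a batch-size change partway through training:

```python
# Sanity check: batch size (tokens) = batch size (sequences) * sequence length.
# SEQ_LEN = 2048 is an assumption (Cerebras-GPT's training context length).
SEQ_LEN = 2048

batch_sizes = {
    "Cerebras-GPT 2.7B": 528,            # table lists 1.08M tokens
    "Cerebras-GPT 6.7B": 1040,           # table lists 2.13M tokens
    "Cerebras-GPT 13B (phase 1)": 720,   # table lists 1.47M tokens
    "Cerebras-GPT 13B (phase 2)": 1080,  # table lists 2.21M tokens
}

for model, seqs in batch_sizes.items():
    print(f"{model}: {seqs} * {SEQ_LEN} = {seqs * SEQ_LEN / 1e6:.2f}M tokens")
```

All four entries reproduce the listed tokens-per-batch values under that assumption.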
@@ -142,24 +137,6 @@ We evaluate our models on the PILE validation set comprising 380M tokens. We als
 | | 2.7B | 9.8E+20 | 1.834 | 0.386 | 0.701 | 0.559 | 0.567 | 0.571 | 0.246 | 0.206 | 0.462 |
 | | 6.7B | 5.9E+21 | TODO | TODO | TODO | TODO | TODO | TODO | TODO | TODO | TODO |
 | | 13B | 2.1E+22 | 1.575 | 0.513 | 0.766 | 0.646 | 0.696 | 0.714 | 0.367 | 0.286 | 0.570 |
-| Pythia | 70M | 1.3E+20 | TODO | 0.270 | 0.590 | 0.491 | 0.259 | 0.413 | 0.185 | 0.132 | 0.334 |
-| | 160M | 3.4E+20 | TODO | 0.293 | 0.627 | 0.519 | 0.389 | 0.452 | 0.181 | 0.160 | 0.375 |
-| | 410M | 8.8E+20 | TODO | 0.333 | 0.668 | 0.530 | 0.505 | 0.504 | 0.213 | 0.178 | 0.419 |
-| | 1B | 2.0E+21 | TODO | 0.376 | 0.705 | 0.545 | 0.566 | 0.559 | 0.243 | 0.196 | 0.456 |
-| | 2.8B | 5.5E+21 | TODO | 0.451 | 0.737 | 0.612 | 0.654 | 0.629 | 0.288 | 0.220 | 0.513 |
-| | 6.9B | 2.6E+22 | TODO | 0.482 | 0.746 | 0.611 | 0.679 | 0.669 | 0.323 | 0.270 | 0.540 |
-| | 12B | 9.0E+22 | TODO | 0.505 | 0.761 | 0.645 | 0.705 | 0.700 | 0.336 | 0.284 | 0.562 |
-| NeoX | 20B | 6.1E+22 | TODO | 0.535 | 0.779 | 0.661 | 0.720 | 0.723 | 0.380 | 0.290 | 0.584 |
-| GPTJ | 6B | 1.5E+22 | TODO | 0.518 | 0.752 | 0.640 | 0.683 | 0.670 | 0.340 | 0.288 | 0.556 |
-| OPT | 125M | 3.4E+20 | - | 0.292 | 0.630 | 0.503 | 0.379 | 0.435 | 0.189 | 0.166 | 0.371 |
-| | 350M | 8.8E+20 | - | 0.320 | 0.644 | 0.523 | 0.452 | 0.440 | 0.207 | 0.176 | 0.395 |
-| | 1.3B | 2.8E+21 | - | 0.415 | 0.717 | 0.595 | 0.579 | 0.570 | 0.234 | 0.234 | 0.478 |
-| | 2.7B | 5.5E+21 | - | 0.458 | 0.738 | 0.610 | 0.637 | 0.609 | 0.268 | 0.250 | 0.510 |
-| | 6.7B | 1.3E+22 | - | 0.505 | 0.763 | 0.654 | 0.677 | 0.656 | 0.307 | 0.276 | 0.548 |
-| | 13B | 2.5E+22 | - | 0.524 | 0.759 | 0.651 | 0.687 | 0.671 | 0.329 | 0.270 | 0.556 |
-| LLaMA | 6.7B | | 0.761 | 0.798 | 0.701 | | 0.728 | 0.476 | 0.572 | | |
-| | 13B | | 0.792 | 0.801 | 0.730 | | 0.748 | 0.527 | 0.564 | | |
-

 #### 5-shot Evaluation
 | Model | Count | Hella-Swag | PIQA | Wino-Grande | Lambada | ARC-e | ARC-c | OpenBookQA |
@@ -171,22 +148,7 @@ We evaluate our models on the PILE validation set comprising 380M tokens. We als
 | | 2.7B | 0.382 | 0.697 | 0.543 | 0.487 | 0.590 | 0.267 | 0.224 |
 | | 6.7B | TODO | TODO | TODO | TODO | TODO | TODO | TODO |
 | | 13B | 0.514 | 0.768 | 0.674 | 0.655 | 0.743 | 0.398 | 0.318 |
-
-| | 160M | 0.292 | 0.631 | 0.515 | 0.329 | 0.469 | 0.205 | 0.164 |
-| | 410M | 0.333 | 0.669 | 0.522 | 0.448 | 0.526 | 0.229 | 0.188 |
-| | 1B | 0.374 | 0.709 | 0.562 | 0.514 | 0.596 | 0.265 | 0.206 |
-| | 1.4B | 0.398 | 0.712 | 0.573 | 0.553 | 0.622 | 0.274 | 0.214 |
-| | 2.8B | 0.448 | 0.738 | 0.621 | 0.629 | 0.673 | 0.328 | 0.254 |
-| | 6.9B | 0.478 | 0.750 | 0.646 | 0.641 | 0.699 | 0.355 | 0.296 |
-| | 12B | 0.506 | 0.759 | 0.662 | 0.673 | 0.731 | 0.383 | 0.322 |
-| NeoX | 20B | 0.538 | 0.774 | 0.683 | 0.698 | 0.746 | 0.410 | 0.326 |
-| GPTJ | 6B | 0.494 | 0.756 | 0.660 | 0.662 | 0.705 | 0.360 | 0.310 |
-| OPT | 125M | 0.289 | 0.628 | 0.520 | 0.303 | 0.426 | 0.197 | 0.166 |
-| | 350M | 0.321 | 0.647 | 0.521 | 0.384 | 0.464 | 0.208 | 0.184 |
-| | 1.3B | 0.413 | 0.726 | 0.597 | 0.553 | 0.604 | 0.273 | 0.230 |
-| | 2.7B | 0.458 | 0.749 | 0.616 | 0.603 | 0.651 | 0.305 | 0.276 |
-| | 6.7B | 0.505 | 0.773 | 0.663 | 0.660 | 0.692 | 0.340 | 0.318 |
-| | 13B | 0.524 | 0.763 | 0.684 | 0.678 | 0.714 | 0.358 | 0.306 |
+

 <br><br>
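The Avg column of the 0-shot table above is consistent with an unweighted mean over the seven task accuracies, with the compute and PILE cross-entropy columns excluded. A quick check against the Cerebras-GPT 13B row, assuming the task columns follow the same order as the 5-shot table header:

```python
# Recompute the 0-shot "Avg" column as the unweighted mean of the seven task
# accuracies. Column order is assumed to match the 5-shot table header.
scores_13b = {
    "Hella-Swag": 0.513, "PIQA": 0.766, "Wino-Grande": 0.646,
    "Lambada": 0.696, "ARC-e": 0.714, "ARC-c": 0.367, "OpenBookQA": 0.286,
}

avg = sum(scores_13b.values()) / len(scores_13b)
print(f"{avg:.3f}")  # 0.570, matching the table's Avg entry
```

The 2.7B row checks out the same way, with a mean of 0.462.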