slimfrikha-tii committed
Commit 6b4ddd0 · verified · 1 Parent(s): 116adab

update benchs

Files changed (1)
  1. README.md +37 -44
README.md CHANGED
@@ -117,97 +117,90 @@ We report in the following table our internal pipeline benchmarks.
  <tr>
  <td rowspan="3">General</td>
  <td>MMLU (5-shot)</td>
- <td>23.4</td>
- <td><b>58.4</b></td>
- <td>48.4</td>
- <td>43.9</td>
+ <td><b>68.2</b></td>
+ <td>59.8</td>
+ <td>49.2</td>
+ <td>46.1</td>
  </tr>
  <tr>
  <td>MMLU-PRO (5-shot)</td>
- <td>11.3</td>
- <td><b>21.3</b></td>
- <td>17.2</td>
+ <td>16</td>
+ <td><b>28.2</b></td>
+ <td>20</td>
  <td>18.6</td>
  </tr>
  <tr>
  <td>IFEval</td>
- <td><b>55.8</b></td>
- <td>44.4</td>
- <td>53.0</td>
+ <td><b>55.3</b></td>
+ <td>44.2</td>
+ <td>53</td>
  <td>54.4</td>
  </tr>
  <tr>
  <td rowspan="3">Math</td>
  <td>GSM8K (5-shot)</td>
- <td>37.4</td>
- <td><b>57.2</b></td>
- <td>43.4</td>
- <td>38.6</td>
+ <td><b>82.6</b></td>
+ <td>57.8</td>
+ <td>47.6</td>
+ <td>43.9</td>
  </tr>
  <tr>
  <td>GSM8K (8-shot, COT)</td>
- <td>35.6</td>
- <td><b>62.2</b></td>
- <td>47.2</td>
- <td>41.8</td>
+ <td>46.6</td>
+ <td><b>58.8</b></td>
+ <td>46.3</td>
+ <td>45.8</td>
  </tr>
  <tr>
  <td>MATH Lvl-5 (4-shot)</td>
- <td><b>3.9</b></td>
- <td>0.2</td>
- <td>0.1</td>
- <td>1.0</td>
+ <td><b>5.2</b></td>
+ <td>1.1</td>
+ <td>3.1</td>
+ <td>1</td>
  </tr>
  <tr>
  <td rowspan="6">Reasoning</td>
  <td>Arc Challenge (25-shot)</td>
- <td>34.1</td>
- <td>47.0</td>
- <td><b>47.6</b></td>
- <td>45.9</td>
+ <td><b>58.6</b></td>
+ <td>50.7</td>
+ <td>49.7</td>
+ <td>47.7</td>
  </tr>
  <tr>
  <td>GPQA (0-shot)</td>
- <td>25.3</td>
+ <td>24.4</td>
  <td><b>29.6</b></td>
- <td>28.7</td>
+ <td>28.6</td>
  <td>26.5</td>
  </tr>
  <tr>
  <td>GPQA (0-shot, COT)</td>
  <td>13.2</td>
  <td>9.2</td>
- <td>16.0</td>
+ <td>16</td>
  <td><b>21.3</b></td>
  </tr>
  <tr>
  <td>MUSR (0-shot)</td>
- <td>32.4</td>
- <td>36.8</td>
- <td>33.0</td>
+ <td>32</td>
+ <td>36.5</td>
+ <td>32.9</td>
  <td><b>40.7</b></td>
  </tr>
  <tr>
  <td>BBH (3-shot)</td>
- <td>30.3</td>
- <td><b>38.5</b></td>
- <td>33.1</td>
+ <td>33.8</td>
+ <td><b>39.2</b></td>
+ <td>34</td>
  <td>35.1</td>
  </tr>
- <tr>
- <td>BBH (3-shot, COT)</td>
- <td>0.0</td>
- <td>20.3</td>
- <td>0.8</td>
- <td><b>30.5</b></td>
- </tr>
  <tr>
  <td rowspan="5">CommonSense Understanding</td>
  <td>PIQA (0-shot)</td>
  <td>72.1</td>
  <td>73.2</td>
  <td><b>74.4</b></td>
- <td>72.0</td>
+ <td>72</td>
  </tr>
  <tr>
  <td>SciQ (0-shot)</td>
@@ -228,7 +221,7 @@ We report in the following table our internal pipeline benchmarks.
  <td>40.2</td>
  <td>40.4</td>
  <td><b>42.8</b></td>
- <td>40.0</td>
+ <td>40</td>
  </tr>
  <tr>
  <td>MT-Bench (avg)</td>