Spaces:
Running
Running
sync format with github repo exported results
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- static/eval_results/Default/Aquila_VL_2B/summary_results.json +2 -4
- static/eval_results/Default/Aria/summary_results.json +2 -4
- static/eval_results/Default/Claude_3.5/summary_results.json +2 -4
- static/eval_results/Default/Claude_3.5_new/summary_results.json +2 -4
- static/eval_results/Default/GPT_4o/summary_results.json +2 -4
- static/eval_results/Default/GPT_4o_mini/summary_results.json +2 -4
- static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json +2 -4
- static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json +2 -4
- static/eval_results/Default/Idefics3/summary_results.json +2 -4
- static/eval_results/Default/InternVL2_2B/summary_results.json +2 -4
- static/eval_results/Default/InternVL2_5_2B/summary_results.json +2 -4
- static/eval_results/Default/InternVL2_5_78B/summary_results.json +2 -4
- static/eval_results/Default/InternVL2_76B/summary_results.json +2 -4
- static/eval_results/Default/InternVL2_8B/summary_results.json +2 -4
- static/eval_results/Default/Llama_3_2_11B/summary_results.json +2 -4
- static/eval_results/Default/Mammoth_VL/summary_results.json +2 -4
- static/eval_results/Default/MiniCPM_v2.6/summary_results.json +2 -4
- static/eval_results/Default/NVLM/summary_results.json +2 -4
- static/eval_results/Default/Phi-3.5-vision/summary_results.json +2 -4
- static/eval_results/Default/Pixtral_12B/summary_results.json +2 -4
- static/eval_results/Default/Qwen2_VL_2B/summary_results.json +2 -4
- static/eval_results/Default/Qwen2_VL_72B/summary_results.json +2 -4
- static/eval_results/Default/Qwen2_VL_7B/summary_results.json +2 -4
- static/eval_results/Default/llava_onevision_72B/summary_results.json +2 -4
- static/eval_results/Default/llava_onevision_7B/summary_results.json +2 -4
- static/eval_results/SI/Aquila_VL_2B/summary_results.json +0 -2
- static/eval_results/SI/Aria/summary_results.json +0 -2
- static/eval_results/SI/Claude_3.5/summary_results.json +2 -4
- static/eval_results/SI/Claude_3.5_new/summary_results.json +2 -4
- static/eval_results/SI/GPT_4o/summary_results.json +2 -4
- static/eval_results/SI/GPT_4o_mini/summary_results.json +2 -4
- static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json +2 -4
- static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json +2 -4
- static/eval_results/SI/Idefics3/summary_results.json +0 -2
- static/eval_results/SI/InternVL2_2B/summary_results.json +0 -2
- static/eval_results/SI/InternVL2_76B/summary_results.json +0 -2
- static/eval_results/SI/InternVL2_8B/summary_results.json +0 -2
- static/eval_results/SI/Llama_3_2_11B/summary_results.json +0 -2
- static/eval_results/SI/MiniCPM_v2.6/summary_results.json +0 -2
- static/eval_results/SI/Molmo_72B/summary_results.json +0 -2
- static/eval_results/SI/Molmo_7B_D/summary_results.json +0 -2
- static/eval_results/SI/NVLM/summary_results.json +0 -2
- static/eval_results/SI/POINTS_15_7B/summary_results.json +0 -2
- static/eval_results/SI/POINTS_7B/summary_results.json +0 -2
- static/eval_results/SI/Phi-3.5-vision/summary_results.json +0 -2
- static/eval_results/SI/Pixtral_12B/summary_results.json +0 -2
- static/eval_results/SI/Qwen2_VL_2B/summary_results.json +0 -2
- static/eval_results/SI/Qwen2_VL_72B/summary_results.json +0 -2
- static/eval_results/SI/Qwen2_VL_7B/summary_results.json +0 -2
- static/eval_results/SI/SmolVLM/summary_results.json +0 -2
static/eval_results/Default/Aquila_VL_2B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.159970161379836
|
7 |
-
"micro_mean_score": 0.15844711671722148
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.24567572098570653
|
13 |
-
"micro_mean_score": 0.2704213241616509
|
14 |
},
|
15 |
"overall_score": 0.17100157004197775
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.159970161379836
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.24567572098570653
|
|
|
12 |
},
|
13 |
"overall_score": 0.17100157004197775
|
14 |
},
|
static/eval_results/Default/Aria/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.289073788209904
|
7 |
-
"micro_mean_score": 0.2859007507765791
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.5103725263180767
|
13 |
-
"micro_mean_score": 0.5349957007738607
|
14 |
},
|
15 |
"overall_score": 0.31755778420402525
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.289073788209904
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.5103725263180767
|
|
|
12 |
},
|
13 |
"overall_score": 0.31755778420402525
|
14 |
},
|
static/eval_results/Default/Claude_3.5/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.5040975742801586
|
7 |
-
"micro_mean_score": 0.5002259116666758
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.6373907158949892
|
13 |
-
"micro_mean_score": 0.6569647463456579
|
14 |
},
|
15 |
"overall_score": 0.5212541172602853
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.5040975742801586
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.6373907158949892
|
|
|
12 |
},
|
13 |
"overall_score": 0.5212541172602853
|
14 |
},
|
static/eval_results/Default/Claude_3.5_new/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.5259191914020757
|
7 |
-
"micro_mean_score": 0.5230785894131227
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.6563419761104125
|
13 |
-
"micro_mean_score": 0.6724419604471196
|
14 |
},
|
15 |
"overall_score": 0.5427062825031487
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.5259191914020757
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.6563419761104125
|
|
|
12 |
},
|
13 |
"overall_score": 0.5427062825031487
|
14 |
},
|
static/eval_results/Default/GPT_4o/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.5265030595065238
|
7 |
-
"micro_mean_score": 0.5236338521693411
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.6478225794744895
|
13 |
-
"micro_mean_score": 0.665391229578676
|
14 |
},
|
15 |
"overall_score": 0.5421184432647768
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.5265030595065238
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.6478225794744895
|
|
|
12 |
},
|
13 |
"overall_score": 0.5421184432647768
|
14 |
},
|
static/eval_results/Default/GPT_4o_mini/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.40767494558789397
|
7 |
-
"micro_mean_score": 0.40431644154143376
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.586537827213665
|
13 |
-
"micro_mean_score": 0.6133276010318144
|
14 |
},
|
15 |
"overall_score": 0.43069690064863675
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.40767494558789397
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.586537827213665
|
|
|
12 |
},
|
13 |
"overall_score": 0.43069690064863675
|
14 |
},
|
static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.4189319021967416
|
7 |
-
"micro_mean_score": 0.41567515414375245
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.5691365176285039
|
13 |
-
"micro_mean_score": 0.5987532244196045
|
14 |
},
|
15 |
"overall_score": 0.4382651695295427
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.4189319021967416
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.5691365176285039
|
|
|
12 |
},
|
13 |
"overall_score": 0.4382651695295427
|
14 |
},
|
static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.4822473962867704
|
7 |
-
"micro_mean_score": 0.4764805563057179
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.5858190649927173
|
13 |
-
"micro_mean_score": 0.6104901117798793
|
14 |
},
|
15 |
"overall_score": 0.4955784031499121
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.4822473962867704
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.5858190649927173
|
|
|
12 |
},
|
13 |
"overall_score": 0.4955784031499121
|
14 |
},
|
static/eval_results/Default/Idefics3/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.08956972487602757
|
7 |
-
"micro_mean_score": 0.08982225274252693
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.3210866162255635
|
13 |
-
"micro_mean_score": 0.35649183147033553
|
14 |
},
|
15 |
"overall_score": 0.11936892871309657
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.08956972487602757
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.3210866162255635
|
|
|
12 |
},
|
13 |
"overall_score": 0.11936892871309657
|
14 |
},
|
static/eval_results/Default/InternVL2_2B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.13141974398938763
|
7 |
-
"micro_mean_score": 0.13063500716262516
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.23864417043743646
|
13 |
-
"micro_mean_score": 0.24901117798796224
|
14 |
},
|
15 |
"overall_score": 0.14522090778963154
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.13141974398938763
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.23864417043743646
|
|
|
12 |
},
|
13 |
"overall_score": 0.14522090778963154
|
14 |
},
|
static/eval_results/Default/InternVL2_5_2B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.17806821966478364
|
7 |
-
"micro_mean_score": 0.17708809739236367
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.2738430375585404
|
13 |
-
"micro_mean_score": 0.2905417024935512
|
14 |
},
|
15 |
"overall_score": 0.19039567147289096
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.17806821966478364
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.2738430375585404
|
|
|
12 |
},
|
13 |
"overall_score": 0.19039567147289096
|
14 |
},
|
static/eval_results/Default/InternVL2_5_78B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.44132952988532753
|
7 |
-
"micro_mean_score": 0.4397079059379812
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.5538024772749066
|
13 |
-
"micro_mean_score": 0.5776870163370592
|
14 |
},
|
15 |
"overall_score": 0.4558062458859664
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.44132952988532753
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.5538024772749066
|
|
|
12 |
},
|
13 |
"overall_score": 0.4558062458859664
|
14 |
},
|
static/eval_results/Default/InternVL2_76B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.3562710424410931
|
7 |
-
"micro_mean_score": 0.35129859801162616
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.5192997443033639
|
13 |
-
"micro_mean_score": 0.5421324161650903
|
14 |
},
|
15 |
"overall_score": 0.3772549347599992
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.3562710424410931
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.5192997443033639
|
|
|
12 |
},
|
13 |
"overall_score": 0.3772549347599992
|
14 |
},
|
static/eval_results/Default/InternVL2_8B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.25956581776451815
|
7 |
-
"micro_mean_score": 0.2546984460483302
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1165,
|
12 |
-
"macro_mean_score": 0.3978571701460552
|
13 |
-
"micro_mean_score": 0.4108583690987125
|
14 |
},
|
15 |
"overall_score": 0.2773656948037259
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.25956581776451815
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1165,
|
11 |
+
"macro_mean_score": 0.3978571701460552
|
|
|
12 |
},
|
13 |
"overall_score": 0.2773656948037259
|
14 |
},
|
static/eval_results/Default/Llama_3_2_11B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.15999641916771298
|
7 |
-
"micro_mean_score": 0.15809331016967038
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.3173342406187366
|
13 |
-
"micro_mean_score": 0.3487962166809973
|
14 |
},
|
15 |
"overall_score": 0.1802478219287358
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.15999641916771298
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.3173342406187366
|
|
|
12 |
},
|
13 |
"overall_score": 0.1802478219287358
|
14 |
},
|
static/eval_results/Default/Mammoth_VL/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.264052880412689
|
7 |
-
"micro_mean_score": 0.2626894374387823
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.37992668750165337
|
13 |
-
"micro_mean_score": 0.40120378331900275
|
14 |
},
|
15 |
"overall_score": 0.27896733083008046
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.264052880412689
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.37992668750165337
|
|
|
12 |
},
|
13 |
"overall_score": 0.27896733083008046
|
14 |
},
|
static/eval_results/Default/MiniCPM_v2.6/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.22955895202146906
|
7 |
-
"micro_mean_score": 0.22560399396899078
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.41728623355613875
|
13 |
-
"micro_mean_score": 0.43452278589853827
|
14 |
},
|
15 |
"overall_score": 0.2537218694467236
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.22955895202146906
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.41728623355613875
|
|
|
12 |
},
|
13 |
"overall_score": 0.2537218694467236
|
14 |
},
|
static/eval_results/Default/NVLM/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.21589726765847422
|
7 |
-
"micro_mean_score": 0.21406043849932396
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.3478114310231307
|
13 |
-
"micro_mean_score": 0.3947549441100602
|
14 |
},
|
15 |
"overall_score": 0.23287631838857856
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.21589726765847422
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.3478114310231307
|
|
|
12 |
},
|
13 |
"overall_score": 0.23287631838857856
|
14 |
},
|
static/eval_results/Default/Phi-3.5-vision/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.22995297916629392
|
7 |
-
"micro_mean_score": 0.22708502951025372
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.3947914647737769
|
13 |
-
"micro_mean_score": 0.42459157351676696
|
14 |
},
|
15 |
"overall_score": 0.2511698139474551
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.22995297916629392
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.3947914647737769
|
|
|
12 |
},
|
13 |
"overall_score": 0.2511698139474551
|
14 |
},
|
static/eval_results/Default/Pixtral_12B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.31362045151669854
|
7 |
-
"micro_mean_score": 0.3100986209078182
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.4566234428542061
|
13 |
-
"micro_mean_score": 0.4870593293207223
|
14 |
},
|
15 |
"overall_score": 0.33202677713439754
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.31362045151669854
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.4566234428542061
|
|
|
12 |
},
|
13 |
"overall_score": 0.33202677713439754
|
14 |
},
|
static/eval_results/Default/Qwen2_VL_2B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.20877163406364055
|
7 |
-
"micro_mean_score": 0.20561526268932287
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.3154302566225611
|
13 |
-
"micro_mean_score": 0.33856405846947557
|
14 |
},
|
15 |
"overall_score": 0.22249997162072932
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.20877163406364055
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.3154302566225611
|
|
|
12 |
},
|
13 |
"overall_score": 0.22249997162072932
|
14 |
},
|
static/eval_results/Default/Qwen2_VL_72B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.4542376574527161
|
7 |
-
"micro_mean_score": 0.4501201906164793
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.5639771804231668
|
13 |
-
"micro_mean_score": 0.5835339638865004
|
14 |
},
|
15 |
"overall_score": 0.4683625465479226
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.4542376574527161
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.5639771804231668
|
|
|
12 |
},
|
13 |
"overall_score": 0.4683625465479226
|
14 |
},
|
static/eval_results/Default/Qwen2_VL_7B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.3293449599230247
|
7 |
-
"micro_mean_score": 0.325331493515679
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1170,
|
12 |
-
"macro_mean_score": 0.43955105763038577
|
13 |
-
"micro_mean_score": 0.45508547008546996
|
14 |
},
|
15 |
"overall_score": 0.34352990319228904
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.3293449599230247
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1170,
|
11 |
+
"macro_mean_score": 0.43955105763038577
|
|
|
12 |
},
|
13 |
"overall_score": 0.34352990319228904
|
14 |
},
|
static/eval_results/Default/llava_onevision_72B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.2974368415462532
|
7 |
-
"micro_mean_score": 0.2956217833156672
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.4599484231632498
|
13 |
-
"micro_mean_score": 0.4850386930352536
|
14 |
},
|
15 |
"overall_score": 0.31835417383358944
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.2974368415462532
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.4599484231632498
|
|
|
12 |
},
|
13 |
"overall_score": 0.31835417383358944
|
14 |
},
|
static/eval_results/Default/llava_onevision_7B/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
-
"macro_mean_score": 0.21362697219149712
|
7 |
-
"micro_mean_score": 0.21073910058505504
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 65,
|
11 |
"num_eval_samples": 1163,
|
12 |
-
"macro_mean_score": 0.33979975321921935
|
13 |
-
"micro_mean_score": 0.36474634565778147
|
14 |
},
|
15 |
"overall_score": 0.2298670331158574
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 440,
|
5 |
"num_eval_samples": 6539,
|
6 |
+
"macro_mean_score": 0.21362697219149712
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 65,
|
10 |
"num_eval_samples": 1163,
|
11 |
+
"macro_mean_score": 0.33979975321921935
|
|
|
12 |
},
|
13 |
"overall_score": 0.2298670331158574
|
14 |
},
|
static/eval_results/SI/Aquila_VL_2B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.20770364903712493,
|
8 |
-
"micro_mean_score": 0.20333142638522636,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.31474202723571276,
|
16 |
-
"micro_mean_score": 0.3326568265682657,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.22197543279693666
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.20770364903712493,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.31474202723571276,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.22197543279693666
|
static/eval_results/SI/Aria/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3178882776147889,
|
8 |
-
"micro_mean_score": 0.3101511832828904,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.5137437248005172,
|
16 |
-
"micro_mean_score": 0.5472939729397295,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.34400233723955265
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3178882776147889,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.5137437248005172,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.34400233723955265
|
static/eval_results/SI/Claude_3.5/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
-
"macro_mean_score": 0.520276385877485
|
7 |
-
"micro_mean_score": 0.5148202137998056
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 42,
|
11 |
"num_eval_samples": 813,
|
12 |
-
"macro_mean_score": 0.6479684260295507
|
13 |
-
"micro_mean_score": 0.6801968019680197
|
14 |
},
|
15 |
"overall_score": 0.5373019912310938
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
+
"macro_mean_score": 0.520276385877485
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 42,
|
10 |
"num_eval_samples": 813,
|
11 |
+
"macro_mean_score": 0.6479684260295507
|
|
|
12 |
},
|
13 |
"overall_score": 0.5373019912310938
|
14 |
},
|
static/eval_results/SI/Claude_3.5_new/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
-
"macro_mean_score": 0.5462752278980763
|
7 |
-
"micro_mean_score": 0.5417881438289601
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 42,
|
11 |
"num_eval_samples": 813,
|
12 |
-
"macro_mean_score": 0.6764020657053476
|
13 |
-
"micro_mean_score": 0.6924969249692496
|
14 |
},
|
15 |
"overall_score": 0.5636254729390457
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
+
"macro_mean_score": 0.5462752278980763
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 42,
|
10 |
"num_eval_samples": 813,
|
11 |
+
"macro_mean_score": 0.6764020657053476
|
|
|
12 |
},
|
13 |
"overall_score": 0.5636254729390457
|
14 |
},
|
static/eval_results/SI/GPT_4o/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
-
"macro_mean_score": 0.5529953662872719
|
7 |
-
"micro_mean_score": 0.5483479105928085
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 42,
|
11 |
"num_eval_samples": 813,
|
12 |
-
"macro_mean_score": 0.6600228904804206
|
13 |
-
"micro_mean_score": 0.6801968019680197
|
14 |
},
|
15 |
"overall_score": 0.5672657028463584
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
+
"macro_mean_score": 0.5529953662872719
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 42,
|
10 |
"num_eval_samples": 813,
|
11 |
+
"macro_mean_score": 0.6600228904804206
|
|
|
12 |
},
|
13 |
"overall_score": 0.5672657028463584
|
14 |
},
|
static/eval_results/SI/GPT_4o_mini/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
-
"macro_mean_score": 0.4431039098921726
|
7 |
-
"micro_mean_score": 0.43780369290573373
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 42,
|
11 |
"num_eval_samples": 813,
|
12 |
-
"macro_mean_score": 0.595574663769726
|
13 |
-
"micro_mean_score": 0.6334563345633456
|
14 |
},
|
15 |
"overall_score": 0.46343334374251305
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
+
"macro_mean_score": 0.4431039098921726
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 42,
|
10 |
"num_eval_samples": 813,
|
11 |
+
"macro_mean_score": 0.595574663769726
|
|
|
12 |
},
|
13 |
"overall_score": 0.46343334374251305
|
14 |
},
|
static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
-
"macro_mean_score": 0.43481964330318734
|
7 |
-
"micro_mean_score": 0.4297862001943635
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 42,
|
11 |
"num_eval_samples": 813,
|
12 |
-
"macro_mean_score": 0.5787083135236054
|
13 |
-
"micro_mean_score": 0.6186961869618696
|
14 |
},
|
15 |
"overall_score": 0.4540047993325765
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
+
"macro_mean_score": 0.43481964330318734
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 42,
|
10 |
"num_eval_samples": 813,
|
11 |
+
"macro_mean_score": 0.5787083135236054
|
|
|
12 |
},
|
13 |
"overall_score": 0.4540047993325765
|
14 |
},
|
static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json
CHANGED
@@ -3,14 +3,12 @@
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
-
"macro_mean_score": 0.4914311038229404
|
7 |
-
"micro_mean_score": 0.48323615160349853
|
8 |
},
|
9 |
"open": {
|
10 |
"num_eval_tasks": 42,
|
11 |
"num_eval_samples": 813,
|
12 |
-
"macro_mean_score": 0.5814975405131552
|
13 |
-
"micro_mean_score": 0.6174661746617466
|
14 |
},
|
15 |
"overall_score": 0.5034399620483024
|
16 |
},
|
|
|
3 |
"core": {
|
4 |
"num_eval_tasks": 273,
|
5 |
"num_eval_samples": 4116,
|
6 |
+
"macro_mean_score": 0.4914311038229404
|
|
|
7 |
},
|
8 |
"open": {
|
9 |
"num_eval_tasks": 42,
|
10 |
"num_eval_samples": 813,
|
11 |
+
"macro_mean_score": 0.5814975405131552
|
|
|
12 |
},
|
13 |
"overall_score": 0.5034399620483024
|
14 |
},
|
static/eval_results/SI/Idefics3/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.08941182847569326,
|
8 |
-
"micro_mean_score": 0.08779475233900695,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.3231434267517844,
|
16 |
-
"micro_mean_score": 0.3618081180811809,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.12057604157917208
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.08941182847569326,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.3231434267517844,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.12057604157917208
|
static/eval_results/SI/InternVL2_2B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.12069001041308772,
|
8 |
-
"micro_mean_score": 0.11842605219090299,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.28522459992910454,
|
16 |
-
"micro_mean_score": 0.28886838868388687,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.14262795568189
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.12069001041308772,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.28522459992910454,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.14262795568189
|
static/eval_results/SI/InternVL2_76B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3998616568018755,
|
8 |
-
"micro_mean_score": 0.39149064302628933,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.554748737158244,
|
16 |
-
"micro_mean_score": 0.5800738007380073,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.42051326751605805
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3998616568018755,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.554748737158244,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.42051326751605805
|
static/eval_results/SI/InternVL2_8B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.27650612401825575,
|
8 |
-
"micro_mean_score": 0.27119471729837735,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.39388373890935635,
|
16 |
-
"micro_mean_score": 0.4045510455104551,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.29215647267040246
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.27650612401825575,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.39388373890935635,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.29215647267040246
|
static/eval_results/SI/Llama_3_2_11B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.20789144960796493,
|
8 |
-
"micro_mean_score": 0.20163641703273802,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.3861125858565788,
|
16 |
-
"micro_mean_score": 0.4130381303813038,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.2316542677744468
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.20789144960796493,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.3861125858565788,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.2316542677744468
|
static/eval_results/SI/MiniCPM_v2.6/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.23230765810722817,
|
8 |
-
"micro_mean_score": 0.22684118052665975,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.4360655066213874,
|
16 |
-
"micro_mean_score": 0.4588560885608856,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.2594753712424494
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.23230765810722817,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.4360655066213874,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.2594753712424494
|
static/eval_results/SI/Molmo_72B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4073,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.36480000609384927,
|
8 |
-
"micro_mean_score": 0.36205779758110807,
|
9 |
"missing_tasks": [
|
10 |
"planning_screenshot_termes",
|
11 |
"table_understanding",
|
@@ -17,7 +16,6 @@
|
|
17 |
"num_eval_samples": 813,
|
18 |
"num_not_eval_samples": 0,
|
19 |
"macro_mean_score": 0.4465682063915481,
|
20 |
-
"micro_mean_score": 0.4850553505535054,
|
21 |
"missing_tasks": []
|
22 |
},
|
23 |
"overall_score": 0.3758072638262318
|
|
|
5 |
"num_eval_samples": 4073,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.36480000609384927,
|
|
|
8 |
"missing_tasks": [
|
9 |
"planning_screenshot_termes",
|
10 |
"table_understanding",
|
|
|
16 |
"num_eval_samples": 813,
|
17 |
"num_not_eval_samples": 0,
|
18 |
"macro_mean_score": 0.4465682063915481,
|
|
|
19 |
"missing_tasks": []
|
20 |
},
|
21 |
"overall_score": 0.3758072638262318
|
static/eval_results/SI/Molmo_7B_D/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4102,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.2098088446992518,
|
8 |
-
"micro_mean_score": 0.20550929661464645,
|
9 |
"missing_tasks": [
|
10 |
"MMSoc_Misinformation_PolitiFact"
|
11 |
]
|
@@ -15,7 +14,6 @@
|
|
15 |
"num_eval_samples": 813,
|
16 |
"num_not_eval_samples": 0,
|
17 |
"macro_mean_score": 0.35697926179118733,
|
18 |
-
"micro_mean_score": 0.38936039360393604,
|
19 |
"missing_tasks": []
|
20 |
},
|
21 |
"overall_score": 0.22949405972428777
|
|
|
5 |
"num_eval_samples": 4102,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.2098088446992518,
|
|
|
8 |
"missing_tasks": [
|
9 |
"MMSoc_Misinformation_PolitiFact"
|
10 |
]
|
|
|
14 |
"num_eval_samples": 813,
|
15 |
"num_not_eval_samples": 0,
|
16 |
"macro_mean_score": 0.35697926179118733,
|
|
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.22949405972428777
|
static/eval_results/SI/NVLM/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.32989872890926025,
|
8 |
-
"micro_mean_score": 0.32315683713111915,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.4469349818134809,
|
16 |
-
"micro_mean_score": 0.4881303813038132,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.34550356262982296
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.32989872890926025,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.4469349818134809,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.34550356262982296
|
static/eval_results/SI/POINTS_15_7B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.31355970638319003,
|
8 |
-
"micro_mean_score": 0.30728203432446294,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.41331219301389166,
|
16 |
-
"micro_mean_score": 0.42749077490774917,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.32686003793395024
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.31355970638319003,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.41331219301389166,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.32686003793395024
|
static/eval_results/SI/POINTS_7B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.25511317681632334,
|
8 |
-
"micro_mean_score": 0.24927711632415062,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.30315625179016,
|
16 |
-
"micro_mean_score": 0.3313653136531366,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.26151892014616823
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.25511317681632334,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.30315625179016,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.26151892014616823
|
static/eval_results/SI/Phi-3.5-vision/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.2561274958722834,
|
8 |
-
"micro_mean_score": 0.2504214576875906,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.4272267419054576,
|
16 |
-
"micro_mean_score": 0.445879458794588,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.2789407286767066
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.2561274958722834,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.4272267419054576,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.2789407286767066
|
static/eval_results/SI/Pixtral_12B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3436942439614412,
|
8 |
-
"micro_mean_score": 0.3373564384613738,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.4417271955536318,
|
16 |
-
"micro_mean_score": 0.4845633456334564,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.3567653041737333
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3436942439614412,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.4417271955536318,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.3567653041737333
|
static/eval_results/SI/Qwen2_VL_2B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.22787906973244856,
|
8 |
-
"micro_mean_score": 0.2234748515064842,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.3509364634962041,
|
16 |
-
"micro_mean_score": 0.3768757687576875,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.24428672223428263
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.22787906973244856,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.3509364634962041,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.24428672223428263
|
static/eval_results/SI/Qwen2_VL_72B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.4730536307784527,
|
8 |
-
"micro_mean_score": 0.4659830915476831,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.5510079982505317,
|
16 |
-
"micro_mean_score": 0.5826568265682657,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.48344754644139654
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.4730536307784527,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.5510079982505317,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.48344754644139654
|
static/eval_results/SI/Qwen2_VL_7B/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3538656561495699,
|
8 |
-
"micro_mean_score": 0.34581250459157137,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.4517429592549692,
|
16 |
-
"micro_mean_score": 0.4730012300123002,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.3669159632302898
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.3538656561495699,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.4517429592549692,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.3669159632302898
|
static/eval_results/SI/SmolVLM/summary_results.json
CHANGED
@@ -5,7 +5,6 @@
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.07348385181460795,
|
8 |
-
"micro_mean_score": 0.0732694668402814,
|
9 |
"missing_tasks": []
|
10 |
},
|
11 |
"open": {
|
@@ -13,7 +12,6 @@
|
|
13 |
"num_eval_samples": 813,
|
14 |
"num_not_eval_samples": 0,
|
15 |
"macro_mean_score": 0.2427337975725658,
|
16 |
-
"micro_mean_score": 0.2504920049200492,
|
17 |
"missing_tasks": []
|
18 |
},
|
19 |
"overall_score": 0.09605051124900234
|
|
|
5 |
"num_eval_samples": 4116,
|
6 |
"num_not_eval_samples": 0,
|
7 |
"macro_mean_score": 0.07348385181460795,
|
|
|
8 |
"missing_tasks": []
|
9 |
},
|
10 |
"open": {
|
|
|
12 |
"num_eval_samples": 813,
|
13 |
"num_not_eval_samples": 0,
|
14 |
"macro_mean_score": 0.2427337975725658,
|
|
|
15 |
"missing_tasks": []
|
16 |
},
|
17 |
"overall_score": 0.09605051124900234
|