Commit 665c137 (1 parent: 61704fb), committed by cccjc

update Ivy_VL_3B
constants.py CHANGED
@@ -132,6 +132,7 @@ MODEL_NAME_MAP = {
     "Gemini-2.0-thinking": "Gemini-2.0-Flash-thinking",
     "Gemini-Flash-2.0-exp": "Gemini-2.0-Flash-exp",
     "Gemini-exp-1206": "Gemini-exp-1206",
+    "Ivy_VL_3B": "Ivy-VL-3B",
 }
 
 DIMENSION_NAME_MAP = {
@@ -223,15 +224,16 @@ MODEL_URLS = {
     "Gemini-2.0-thinking": "https://ai.google.dev/gemini-api/docs/thinking-mode",
     "Gemini-exp-1206": "https://blog.google/feed/gemini-exp-1206/",
     "Gemini-Flash-2.0-exp": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/#gemini-2-0-flash",
+    "Ivy_VL_3B": "https://huggingface.co/AI-Safeguard/Ivy-VL-llava",
 }
 
 # Define the base MODEL_GROUPS structure
 BASE_MODEL_GROUPS = {
     "All": list(MODEL_NAME_MAP.keys()),
     "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212', "Gemini-exp-1206"],
-    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Gemini-2.0-thinking", "Gemini-Flash-2.0-exp"],
+    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Gemini-2.0-thinking", "Gemini-Flash-2.0-exp", "Ivy_VL_3B"],
     "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Grok-2-vision-1212', "Gemini-exp-1206"],
     "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', "Gemini-Flash-2.0-exp", "Gemini-2.0-thinking"],
     "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
-    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"]
+    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B", "Ivy_VL_3B"]
 }
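
This change registers the new model in three structures that have to stay in sync: MODEL_NAME_MAP (internal key to display name), MODEL_URLS (key to model page), and the BASE_MODEL_GROUPS lists. The snippet below is not part of the commit; it is a minimal sketch, assuming constants.py is importable from the repository root, of a consistency check that would catch an entry added to one structure but missed in another.

# check_registry.py — hypothetical helper, not included in this commit.
# A minimal sketch (assuming constants.py is importable from the repo root)
# that verifies every model referenced in MODEL_URLS and BASE_MODEL_GROUPS
# also has a display name in MODEL_NAME_MAP.
from constants import BASE_MODEL_GROUPS, MODEL_NAME_MAP, MODEL_URLS

def check_model_registry() -> None:
    known = set(MODEL_NAME_MAP)  # internal model keys

    # Models with a URL but no display-name entry.
    missing_urls = set(MODEL_URLS) - known
    # Group members that are not registered model keys.
    missing_groups = {
        (group, model)
        for group, models in BASE_MODEL_GROUPS.items()
        for model in models
        if model not in known
    }

    assert not missing_urls, f"MODEL_URLS keys missing from MODEL_NAME_MAP: {missing_urls}"
    assert not missing_groups, f"group entries missing from MODEL_NAME_MAP: {missing_groups}"

if __name__ == "__main__":
    check_model_registry()
    print(f"{len(MODEL_NAME_MAP)} models registered; cross-references are consistent.")

The new "Ivy_VL_3B" entry would satisfy such a check, since this commit adds it to the name map, the URL map, and both efficiency-model groups.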
static/eval_results/Default/Ivy_VL_3B/summary_and_keyword_stats.json ADDED
@@ -0,0 +1,249 @@
{
    "model_summary": {
        "core": {
            "num_eval_tasks": 440,
            "num_eval_samples": 6539,
            "macro_mean_score": 0.19185564187617368
        },
        "open": {
            "num_eval_tasks": 65,
            "num_eval_samples": 1163,
            "macro_mean_score": 0.27715328322847527
        },
        "overall_score": 0.20283454620864816
    },
    "keyword_stats": {
        "skills": {
            "Object Recognition and Classification": {
                "count": 303,
                "num_samples": 4755,
                "tasks": [],
                "average_score": 0.211403742529065
            },
            "Text Recognition (OCR)": {
                "count": 137,
                "num_samples": 2239,
                "tasks": [],
                "average_score": 0.16550807652825617
            },
            "Language Understanding and Generation": {
                "count": 154,
                "num_samples": 2509,
                "tasks": [],
                "average_score": 0.20807789378302347
            },
            "Scene and Event Understanding": {
                "count": 154,
                "num_samples": 2467,
                "tasks": [],
                "average_score": 0.26453498820602844
            },
            "Mathematical and Logical Reasoning": {
                "count": 109,
                "num_samples": 1910,
                "tasks": [],
                "average_score": 0.1880366735477861
            },
            "Commonsense and Social Reasoning": {
                "count": 51,
                "num_samples": 855,
                "tasks": [],
                "average_score": 0.30546713625131916
            },
            "Ethical and Safety Reasoning": {
                "count": 15,
                "num_samples": 245,
                "tasks": [],
                "average_score": 0.4761629072681704
            },
            "Domain-Specific Knowledge and Skills": {
                "count": 77,
                "num_samples": 1386,
                "tasks": [],
                "average_score": 0.19519503635292754
            },
            "Spatial and Temporal Reasoning": {
                "count": 152,
                "num_samples": 2437,
                "tasks": [],
                "average_score": 0.19092980712966892
            },
            "Planning and Decision Making": {
                "count": 37,
                "num_samples": 577,
                "tasks": [],
                "average_score": 0.0388946980911512
            }
        },
        "input_format": {
            "User Interface Screenshots": {
                "count": 93,
                "num_samples": 1517,
                "tasks": [],
                "average_score": 0.10493661164954043
            },
            "Text-Based Images and Documents": {
                "count": 82,
                "num_samples": 1294,
                "tasks": [],
                "average_score": 0.1249415479767782
            },
            "Diagrams and Data Visualizations": {
                "count": 101,
                "num_samples": 1718,
                "tasks": [],
                "average_score": 0.22725684171603053
            },
            "Videos": {
                "count": 43,
                "num_samples": 698,
                "tasks": [],
                "average_score": 0.2536386994618329
            },
            "Artistic and Creative Content": {
                "count": 32,
                "num_samples": 541,
                "tasks": [],
                "average_score": 0.2524995045658917
            },
            "Photographs": {
                "count": 143,
                "num_samples": 2248,
                "tasks": [],
                "average_score": 0.27729106359048106
            },
            "3D Models and Aerial Imagery": {
                "count": 11,
                "num_samples": 169,
                "tasks": [],
                "average_score": 0.07592024437626757
            }
        },
        "output_format": {
            "contextual_formatted_text": {
                "count": 98,
                "num_samples": 1514,
                "tasks": [],
                "average_score": 0.1701675351847416
            },
            "structured_output": {
                "count": 110,
                "num_samples": 1714,
                "tasks": [],
                "average_score": 0.13008026550344784
            },
            "exact_text": {
                "count": 83,
                "num_samples": 1278,
                "tasks": [],
                "average_score": 0.2158637762929687
            },
            "numerical_data": {
                "count": 49,
                "num_samples": 862,
                "tasks": [],
                "average_score": 0.20877347808029043
            },
            "open_ended_output": {
                "count": 80,
                "num_samples": 1454,
                "tasks": [],
                "average_score": 0.26632791563900915
            },
            "multiple_choice": {
                "count": 85,
                "num_samples": 1363,
                "tasks": [],
                "average_score": 0.25874554202955213
            }
        },
        "input_num": {
            "6-8 images": {
                "count": 21,
                "num_samples": 314,
                "tasks": [],
                "average_score": 0.13869047619047617
            },
            "9-image or more": {
                "count": 41,
                "num_samples": 623,
                "tasks": [],
                "average_score": 0.11042798544340451
            },
            "1-image": {
                "count": 315,
                "num_samples": 5228,
                "tasks": [],
                "average_score": 0.2052404551683576
            },
            "video": {
                "count": 43,
                "num_samples": 698,
                "tasks": [],
                "average_score": 0.2536386994618329
            },
            "4-5 images": {
                "count": 34,
                "num_samples": 520,
                "tasks": [],
                "average_score": 0.1866009831347769
            },
            "2-3 images": {
                "count": 51,
                "num_samples": 802,
                "tasks": [],
                "average_score": 0.2566619127590936
            }
        },
        "app": {
            "Information_Extraction": {
                "count": 72,
                "num_samples": 1124,
                "tasks": [],
                "average_score": 0.1410253687207258
            },
            "Planning": {
                "count": 78,
                "num_samples": 1239,
                "tasks": [],
                "average_score": 0.0749166420062132
            },
            "Coding": {
                "count": 31,
                "num_samples": 474,
                "tasks": [],
                "average_score": 0.13665778139212675
            },
            "Perception": {
                "count": 145,
                "num_samples": 2313,
                "tasks": [],
                "average_score": 0.24948956246459913
            },
            "Metrics": {
                "count": 20,
                "num_samples": 309,
                "tasks": [],
                "average_score": 0.4214677030751524
            },
            "Science": {
                "count": 29,
                "num_samples": 574,
                "tasks": [],
                "average_score": 0.2205827555902744
            },
            "Knowledge": {
                "count": 97,
                "num_samples": 1605,
                "tasks": [],
                "average_score": 0.2480287030335926
            },
            "Mathematics": {
                "count": 33,
                "num_samples": 547,
                "tasks": [],
                "average_score": 0.21626379583600183
            }
        }
    }
}
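
The scoring code is not part of this commit, but the model_summary block above is numerically consistent with overall_score being a task-count-weighted average of the core and open macro scores. A small check using the values from this file, shown as a sketch rather than the actual evaluation code:

# Sketch only: this relationship is inferred from the numbers in the summary
# file above, not taken from the evaluation code (which is not included here).
core_tasks, core_macro = 440, 0.19185564187617368
open_tasks, open_macro = 65, 0.27715328322847527

overall = (core_tasks * core_macro + open_tasks * open_macro) / (core_tasks + open_tasks)
print(overall)  # ~0.2028345462086482, matching "overall_score" up to float rounding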
static/eval_results/Default/Ivy_VL_3B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
 
static/eval_results/SI/Ivy_VL_3B/summary_and_keyword_stats.json ADDED
@@ -0,0 +1,213 @@
{
    "model_summary": {
        "core": {
            "num_eval_tasks": 273,
            "num_eval_samples": 4116,
            "macro_mean_score": 0.19338729492202295
        },
        "open": {
            "num_eval_tasks": 42,
            "num_eval_samples": 813,
            "macro_mean_score": 0.2822859967695332
        },
        "overall_score": 0.20524045516835765
    },
    "keyword_stats": {
        "skills": {
            "Text Recognition (OCR)": {
                "count": 101,
                "num_samples": 1687,
                "tasks": [],
                "average_score": 0.17399890430893702
            },
            "Object Recognition and Classification": {
                "count": 172,
                "num_samples": 2714,
                "tasks": [],
                "average_score": 0.21109372364521883
            },
            "Scene and Event Understanding": {
                "count": 60,
                "num_samples": 1004,
                "tasks": [],
                "average_score": 0.33373019769952694
            },
            "Mathematical and Logical Reasoning": {
                "count": 91,
                "num_samples": 1630,
                "tasks": [],
                "average_score": 0.19559519566671743
            },
            "Language Understanding and Generation": {
                "count": 102,
                "num_samples": 1713,
                "tasks": [],
                "average_score": 0.22630314325483508
            },
            "Domain-Specific Knowledge and Skills": {
                "count": 46,
                "num_samples": 897,
                "tasks": [],
                "average_score": 0.17591613987857024
            },
            "Spatial and Temporal Reasoning": {
                "count": 78,
                "num_samples": 1273,
                "tasks": [],
                "average_score": 0.1892291725086101
            },
            "Planning and Decision Making": {
                "count": 23,
                "num_samples": 356,
                "tasks": [],
                "average_score": 0.036703008210857355
            },
            "Commonsense and Social Reasoning": {
                "count": 38,
                "num_samples": 654,
                "tasks": [],
                "average_score": 0.33693960266311135
            },
            "Ethical and Safety Reasoning": {
                "count": 10,
                "num_samples": 170,
                "tasks": [],
                "average_score": 0.6292443609022556
            }
        },
        "input_format": {
            "User Interface Screenshots": {
                "count": 67,
                "num_samples": 1123,
                "tasks": [],
                "average_score": 0.10470899167807271
            },
            "Text-Based Images and Documents": {
                "count": 53,
                "num_samples": 847,
                "tasks": [],
                "average_score": 0.10952573461485345
            },
            "Photographs": {
                "count": 83,
                "num_samples": 1315,
                "tasks": [],
                "average_score": 0.3040415952138278
            },
            "Diagrams and Data Visualizations": {
                "count": 88,
                "num_samples": 1524,
                "tasks": [],
                "average_score": 0.2361422770272551
            },
            "Artistic and Creative Content": {
                "count": 22,
                "num_samples": 389,
                "tasks": [],
                "average_score": 0.2541893948086852
            },
            "3D Models and Aerial Imagery": {
                "count": 2,
                "num_samples": 30,
                "tasks": [],
                "average_score": 0.1111187670386593
            }
        },
        "output_format": {
            "structured_output": {
                "count": 72,
                "num_samples": 1121,
                "tasks": [],
                "average_score": 0.1448509560944764
            },
            "contextual_formatted_text": {
                "count": 63,
                "num_samples": 975,
                "tasks": [],
                "average_score": 0.17036364091761733
            },
            "exact_text": {
                "count": 57,
                "num_samples": 880,
                "tasks": [],
                "average_score": 0.15740808174047646
            },
            "numerical_data": {
                "count": 39,
                "num_samples": 694,
                "tasks": [],
                "average_score": 0.19788851141672356
            },
            "open_ended_output": {
                "count": 51,
                "num_samples": 991,
                "tasks": [],
                "average_score": 0.2682473537043838
            },
            "multiple_choice": {
                "count": 33,
                "num_samples": 567,
                "tasks": [],
                "average_score": 0.39751628842537934
            }
        },
        "input_num": {
            "1-image": {
                "count": 315,
                "num_samples": 5228,
                "tasks": [],
                "average_score": 0.2052404551683576
            }
        },
        "app": {
            "Information_Extraction": {
                "count": 41,
                "num_samples": 644,
                "tasks": [],
                "average_score": 0.10690741122489508
            },
            "Planning": {
                "count": 44,
                "num_samples": 714,
                "tasks": [],
                "average_score": 0.06933513581009754
            },
            "Coding": {
                "count": 16,
                "num_samples": 244,
                "tasks": [],
                "average_score": 0.18259439192343602
            },
            "Perception": {
                "count": 82,
                "num_samples": 1321,
                "tasks": [],
                "average_score": 0.2810337734703279
            },
            "Metrics": {
                "count": 3,
                "num_samples": 45,
                "tasks": [],
                "average_score": 0.36507936507936506
            },
            "Science": {
                "count": 22,
                "num_samples": 469,
                "tasks": [],
                "average_score": 0.2593828964382621
            },
            "Knowledge": {
                "count": 77,
                "num_samples": 1294,
                "tasks": [],
                "average_score": 0.23992559516244502
            },
            "Mathematics": {
                "count": 30,
                "num_samples": 497,
                "tasks": [],
                "average_score": 0.19915270674792282
            }
        }
    }
}
static/eval_results/SI/Ivy_VL_3B/task_results.json ADDED
The diff for this file is too large to render. See raw diff
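
Both splits use the same file layout, so the new results can be read back directly. The snippet below is not part of the commit; it is a small sketch, assuming it is run from the repository root where the static/eval_results tree added here lives, that prints the headline Ivy_VL_3B scores for the Default and SI splits.

# Sketch only, not included in this commit: load the two summary files added
# above and print the headline scores (paths as they appear in this commit).
import json

for split in ("Default", "SI"):
    path = f"static/eval_results/{split}/Ivy_VL_3B/summary_and_keyword_stats.json"
    with open(path) as f:
        summary = json.load(f)["model_summary"]
    print(
        f"{split}: overall={summary['overall_score']:.4f} "
        f"(core macro={summary['core']['macro_mean_score']:.4f}, "
        f"open macro={summary['open']['macro_mean_score']:.4f})"
    )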