diff --git "a/static/eval_results/SI/Ivy_VL_3B/task_results.json" "b/static/eval_results/SI/Ivy_VL_3B/task_results.json" new file mode 100644--- /dev/null +++ "b/static/eval_results/SI/Ivy_VL_3B/task_results.json" @@ -0,0 +1,4818 @@ +[ + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.07938039122017577, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Spatial and Temporal Reasoning" + ], + "input_format": "3D Models and Aerial Imagery", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.15555555555555553, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "coco_person_detection", + "score": 0.11976895147024216, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "game_info_parsing", + "score": 0.3181818181818182, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.16335281233361162, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.02222222222222222, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "tqa_textbook_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.25257142857142856, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.631578947368421, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_planar", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "stock_price_future_prediction", + "score": 0.24357142857142852, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "iconqa", + "score": 0.3684210526315789, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_biology", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "code_programming_test_easy", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "license_plate_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "math_convexity_value_estimation", + "score": 0.3473166123911397, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_analytic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "famous_building_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_transformation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "human_relationship_reasoning", + "score": 0.375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Commonsense and Social Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "algebra", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "single_person_pose_estimation", + "score": 0.11257002137534133, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "funsd_document_qa", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "movie_info_parsing", + "score": 0.15178571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "science_basic_physics", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_theory", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.04657142857142861, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "chart_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "quizlet_question_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "math_breakpoint", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "animal_pose_estimation", + "score": 0.2146751035113978, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "long_string_number_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.4662105263157896, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "face_keypoint_detection", + "score": 0.848854419078294, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_descriptive", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "science_molecule_chemistry", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.13392857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_chordless_cycle", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.44949999999999996, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "humor_understand_caption_match", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "places365_scene_type_classification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_winner_identification", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_solid", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Spatial and Temporal Reasoning", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.39, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "vibe_eval_short_phrase", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_chemistry", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "insect_order_classification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_physics", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "dvqa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.4164684210526316, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "super_clevr", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "figureqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.07466666666666663, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "math_parity", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_isomorphism", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "snli_ve_visual_entailment", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "stock_info_parsing", + "score": 0.05882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "physical_property_reasoning", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "image_style_recognition", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_length", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.13265306122448978, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "youtube_video_info_parsing", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "map_diagram_qa", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "weather_info_parsing", + "score": 0.19841269841269837, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "graph_connectivity", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ti_fused_vqa_math", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "signboard_identification", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "TV_show_info_parsing", + "score": 0.11904761904761907, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "music_info_parsing", + "score": 0.11607142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_cell_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.32142857142857145, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_find_odd_one_out", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MMMU_physics_chemistry_MCQ", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.12121212121212122, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "play_go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "knowledge_graph_understanding", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.061224489795918366, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Ad_count_detection", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "music_sheet_format_QA", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "signage_navigation", + "score": 0.4, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "position_relationship", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "web_action_grounding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.17857142857142858, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "location_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ascii_art_understanding", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "CLEVRER_physics", + "score": 0.35, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "web_action_prediction", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "number_comparison", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "webpage_code_understanding", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "product_ocr_qa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "extract_webpage_headline", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chinese_idiom_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_depth_of_different_points", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Science", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.21605811930298385, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.08163265306122448, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09455782312925169, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "orchestra_score_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "cultural_vqa", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "monthly_weather_days_count", + "score": 0.2380952380952381, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "Bongard_Problem", + "score": 0.10526315789473684, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "game_platform_support_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Object Recognition and Classification" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_grid", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ishihara_test", + "score": 0.18571428571428572, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "paper_vqa", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "transit_map_intersection_points", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "top_video_creator_identification", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "realworld_qa_en2cn", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "rebus", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_grippers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "font_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Mathematics", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "entertainment_web_game_style", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "mensa_iq_test", + "score": 0.08333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "image_translation_en2cn", + "score": 0.09101923578817367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "code_execution", + "score": 0.0625, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "hashtag_recommendation", + "score": 0.65, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "LaTeX_complex_formula_convertion", + "score": 0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "music_sheet_note_count", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "counting", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "llavaguard", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "circuit_diagram_understanding", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "media_QA_web_stackoverflow", + "score": 0.4523809523809524, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "chess_find_legal_moves", + "score": 0.03355324641748354, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "mindmap_elements_parsing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "flowchart_code_generation", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Planning and Decision Making", + "Spatial and Temporal Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "arxiv_vqa", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "healthcare_info_judgement", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills", + "Ethical and Safety Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_asia", + "score": 0.042857142857142864, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_europe", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_africa", + "score": 0.014285714285714287, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "multiple_states_identify_americas", + "score": 0.05714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.05555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_depth", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_relation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_count", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "cvbench_adapted_cvbench_distance", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_TheoremQA", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_MATH", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_homepage", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "research_website_parsing_publication", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "reward_models_I2T_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_haiku", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_limerick", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Language Understanding and Generation", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_school_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_math_text_latex", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_csv", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_math_equation", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_journal", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_latex", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_html", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_table_to_markdown", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ocr_resume_employer_plain", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "ocr_article_authors", + "score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Information_Extraction", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_multi_question", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "panel_images_single_question", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_excel", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.058823529411764705, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_notes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_word", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "crossword_mini_5x5", + "score": 0.0071428571428571435, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Spatial and Temporal Reasoning", + "Planning and Decision Making", + "Mathematical and Logical Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_contain_length", + "score": 0.8, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_position_length", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "constrained_generation_contain_length", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_artifact_reason", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "autorater_artifact", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Metrics", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "shape_composition_colours", + "score": 0.06561791383219955, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "shape_composition_shapes", + "score": 0.13520408163265304, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_famous_treaty", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_indian_celebrity", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "memorization_papers", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Text Recognition (OCR)", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_swap", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_clip_stable_diffusion_generate", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_background_change", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_face_attribute_edit", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_veracity", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_entity_replace", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_text_style", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MFC_Bench_check_out_of_context", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_skribbl_io", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_doodle_guess", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "counterfactual_arithmetic", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Mathematics", + "output_format": "numerical_data", + "num_input": "1-image" + }, + { + "name": "table_understanding_fact_verification", + "score": 0.5238095238095238, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Mathematical and Logical Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "exact_text", + "num_input": "1-image" + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning", + "Text Recognition (OCR)" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation", + "Object Recognition and Classification", + "Scene and Event Understanding", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "multiple_choice", + "num_input": "1-image" + }, + { + "name": "MMSoc_Memotion", + "score": 0.611764705882353, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "ascii_art_30", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "contextual_formatted_text", + "num_input": "1-image" + }, + { + "name": "electrocardiogram", + "score": 0.1142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Text-Based Images and Documents", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "guess_image_generation_prompt", + "score": 0.4526315789473684, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table2latex_complex", + "score": 0.17777777777777778, + "eval_type": "llm", + "num_demo": 1, + "num_query": 9, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "defeasible_reasoning", + "score": 0.5586206896551725, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "meme_explain", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "funny_image_title", + "score": 0.39285714285714285, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "vibe-eval", + "score": 0.31428571428571433, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_captioning_with_additional_requirements", + "score": 0.6, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "tweets_captioning", + "score": 0.2357142857142857, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "visualization_with_code", + "score": 0.0642857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Coding", + "output_format": "structured_output", + "num_input": "1-image" + }, + { + "name": "unusual_images", + "score": 0.3379310344827587, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Spatial and Temporal Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Easy", + "score": 0.6346153846153848, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bar_chart_interpretation", + "score": 0.35862068965517235, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "iq_test", + "score": 0.19655172413793104, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Mathematical and Logical Reasoning", + "Spatial and Temporal Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "figurative_speech_explanation", + "score": 0.6000000000000002, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Scene and Event Understanding", + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "docci_image_description_long", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "traffic_accident_analysis", + "score": 0.1285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "scibench_w_solution_open_ended", + "score": 0.07200000000000001, + "eval_type": "llm", + "num_demo": 1, + "num_query": 25, + "skills": [ + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Science", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "ocrqa", + "score": 0.4724137931034483, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Text-Based Images and Documents", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "image_humor_understanding", + "score": 0.406896551724138, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Commonsense and Social Reasoning", + "Scene and Event Understanding", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "graph_interpretation", + "score": 0.27586206896551724, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Mathematical and Logical Reasoning", + "Language Understanding and Generation", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "humor_explanation", + "score": 0.18666666666666668, + "eval_type": "llm", + "num_demo": 1, + "num_query": 15, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "GUI_Chat_Hard", + "score": 0.49999999999999994, + "eval_type": "llm", + "num_demo": 1, + "num_query": 31, + "skills": [ + "Object Recognition and Classification", + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Information_Extraction", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "science_figure_explanation", + "score": 0.3068965517241379, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Object Recognition and Classification", + "Scene and Event Understanding", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "art_explanation", + "score": 0.28965517241379307, + "eval_type": "llm", + "num_demo": 1, + "num_query": 29, + "skills": [ + "Language Understanding and Generation", + "Scene and Event Understanding", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.02142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.014285714285714287, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.0, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.0071428571428571435, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Language Understanding and Generation", + "Commonsense and Social Reasoning" + ], + "input_format": "User Interface Screenshots", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "table_understanding_fetaqa", + "score": 0.1785714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Text Recognition (OCR)", + "Language Understanding and Generation" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Perception", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_captcha", + "score": 0.10000000000000003, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Text Recognition (OCR)" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_jailbreak", + "score": 0.635, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Text Recognition (OCR)", + "Ethical and Safety Reasoning" + ], + "input_format": "Text-Based Images and Documents", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_racial", + "score": 0.6499999999999998, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Ethical and Safety Reasoning", + "Scene and Event Understanding", + "Object Recognition and Classification" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_celebrity", + "score": 0.6150000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Commonsense and Social Reasoning", + "Language Understanding and Generation" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_politics", + "score": 0.58, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20, + "skills": [ + "Scene and Event Understanding", + "Commonsense and Social Reasoning", + "Ethical and Safety Reasoning" + ], + "input_format": "Photographs", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8631578947368422, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19, + "skills": [ + "Ethical and Safety Reasoning", + "Commonsense and Social Reasoning" + ], + "input_format": "Artistic and Creative Content", + "app": "Knowledge", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_advanced", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.02142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Planning and Decision Making", + "Mathematical and Logical Reasoning", + "Domain-Specific Knowledge and Skills" + ], + "input_format": "Diagrams and Data Visualizations", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + }, + { + "name": "bridge_strategies_expert", + "score": 0.38571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14, + "skills": [ + "Object Recognition and Classification", + "Spatial and Temporal Reasoning", + "Planning and Decision Making" + ], + "input_format": "User Interface Screenshots", + "app": "Planning", + "output_format": "open_ended_output", + "num_input": "1-image" + } +] \ No newline at end of file