dataset,version,metric,mode,internvl-chat-20b
mmlu,-,naive_average,gen,46.35
mmlu_pro,-,-,-,-
cmmlu,-,naive_average,gen,47.13
ceval,-,naive_average,gen,48.56
agieval,-,-,-,-
GaokaoBench,-,weighted_average,gen,32.28
GPQA_extended,-,-,-,-
GPQA_main,-,-,-,-
GPQA_diamond,-,-,-,-
ARC-c,-,-,-,-
truthfulqa,-,-,-,-
triviaqa,2121ce,score,gen,31.47
triviaqa_wiki_1shot,-,-,-,-
nq,3dcea1,score,gen,13.21
C3,8c358f,accuracy,gen,76.88
race-high,9a54b6,accuracy,gen,72.56
flores_100,-,-,-,-
winogrande,b36770,accuracy,gen,58.72
hellaswag,e42710,accuracy,gen,53.69
bbh,-,naive_average,gen,36.32
gsm8k,1d7fe4,accuracy,gen,40.71
math,393424,accuracy,gen,6.96
TheoremQA,6f0af8,score,gen,12.25
MathBench,-,-,-,-
openai_humaneval,8e312c,humaneval_pass@1,gen,32.32
humaneval_plus,-,-,-,-
humanevalx,-,-,-,-
sanitized_mbpp,a447ff,score,gen,33.07
mbpp_plus,-,-,-,-
mbpp_cn,6fb572,score,gen,23.40
leval,-,-,-,-
leval_closed,-,-,-,-
leval_open,-,-,-,-
longbench,-,-,-,-
longbench_single-document-qa,-,-,-,-
longbench_multi-document-qa,-,-,-,-
longbench_summarization,-,-,-,-
longbench_few-shot-learning,-,-,-,-
longbench_synthetic-tasks,-,-,-,-
longbench_code-completion,-,-,-,-
teval,-,-,-,-
teval_zh,-,-,-,-
IFEval,3321a3,Prompt-level-strict-accuracy,gen,19.78
IFEval,3321a3,Inst-level-strict-accuracy,gen,31.89
IFEval,3321a3,Prompt-level-loose-accuracy,gen,22.92
IFEval,3321a3,Inst-level-loose-accuracy,gen,35.13