dataset,version,metric,mode,internvl-chat-20b
mmlu,-,naive_average,gen,46.35
mmlu_pro,-,-,-,-
cmmlu,-,naive_average,gen,47.13
ceval,-,naive_average,gen,48.56
agieval,-,-,-,-
GaokaoBench,-,weighted_average,gen,32.28
GPQA_extended,-,-,-,-
GPQA_main,-,-,-,-
GPQA_diamond,-,-,-,-
ARC-c,-,-,-,-
truthfulqa,-,-,-,-
triviaqa,2121ce,score,gen,31.47
triviaqa_wiki_1shot,-,-,-,-
nq,3dcea1,score,gen,13.21
C3,8c358f,accuracy,gen,76.88
race-high,9a54b6,accuracy,gen,72.56
flores_100,-,-,-,-
winogrande,b36770,accuracy,gen,58.72
hellaswag,e42710,accuracy,gen,53.69
bbh,-,naive_average,gen,36.32
gsm8k,1d7fe4,accuracy,gen,40.71
math,393424,accuracy,gen,6.96
TheoremQA,6f0af8,score,gen,12.25
MathBench,-,-,-,-
openai_humaneval,8e312c,humaneval_pass@1,gen,32.32
humaneval_plus,-,-,-,-
humanevalx,-,-,-,-
sanitized_mbpp,a447ff,score,gen,33.07
mbpp_plus,-,-,-,-
mbpp_cn,6fb572,score,gen,23.40
leval,-,-,-,-
leval_closed,-,-,-,-
leval_open,-,-,-,-
longbench,-,-,-,-
longbench_single-document-qa,-,-,-,-
longbench_multi-document-qa,-,-,-,-
longbench_summarization,-,-,-,-
longbench_few-shot-learning,-,-,-,-
longbench_synthetic-tasks,-,-,-,-
longbench_code-completion,-,-,-,-
teval,-,-,-,-
teval_zh,-,-,-,-
IFEval,3321a3,Prompt-level-strict-accuracy,gen,19.78
IFEval,3321a3,Inst-level-strict-accuracy,gen,31.89
IFEval,3321a3,Prompt-level-loose-accuracy,gen,22.92
IFEval,3321a3,Inst-level-loose-accuracy,gen,35.13