"""Score model outputs on math benchmarks using the `is_equal` answer grader."""

import json
import re

from grader import is_equal

|
def get_gold_list(datapath, dataset_name):
    """Read a benchmark JSONL file and return the gold answer for every question."""
    assert dataset_name in ["gsm8k", "math", "minerva_math", "gaokao2023en", "olympiadbench", "collegemath"]

    gold_list = []
    with open(datapath, "r") as f:
        for line in f:
            item = json.loads(line)

            if dataset_name == "gsm8k":
                # GSM8K solutions end with "#### <final answer>".
                gold = item['answer'].split("#### ")[-1]

            elif dataset_name == "math":
                gold = item['answer']

            elif dataset_name == "minerva_math":
                # Take the contents of the last \boxed{...} in the solution;
                # the pattern tolerates braces nested up to two levels deep.
                pattern = r"\\boxed\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}"
                pattern_re = re.compile(pattern, re.DOTALL)
                solution = item['solution']
                matches = pattern_re.findall(solution)
                gold = matches[-1] if matches else None

            elif dataset_name == "gaokao2023en":
                # Strip one wrapping "$...$" pair if present.
                gold = re.sub(r'^\$(.*)\$$', r'\1', item['answer'])

            elif dataset_name == "olympiadbench":
                gold = re.sub(r'^\$(.*)\$$', r'\1', item['final_answer'][0])

            elif dataset_name == "collegemath":
                gold = re.sub(r'^\$(.*)\$$', r'\1', item['answer'])

            gold_list.append(gold)

    return gold_list
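
# Example (made-up record, not taken from the benchmark data): for a GSM8K line
# whose "answer" field ends in "#### 72", get_gold_list yields "72" for that entry:
#   json.loads('{"answer": "... #### 72"}')["answer"].split("#### ")[-1] == "72"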
|
def get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name):
    """Grade model outputs against the gold answers and print accuracy."""
    gold_list = get_gold_list(test_gold_path, dataset_name)

    # Load output_list (List[str]) from model_output_path: one response string per
    # benchmark question, in the same order as gold_list. The loader below is a
    # minimal sketch that assumes one {"response": ...} JSON object per line; the
    # "response" field name is an assumption, so adapt it to your output format.
    output_list = []
    with open(model_output_path, "r") as f:
        for line in f:
            output_list.append(json.loads(line)["response"])

    # The two lists must be aligned one-to-one.
    assert len(output_list) == len(gold_list)

    correct = 0
    for output, gold in zip(output_list, gold_list):
        if is_equal(output, gold, dataset_name):
            correct += 1

    print("accuracy on %s is %.4f" % (dataset_name, correct / len(gold_list)))
|
|
if __name__ == "__main__":
    # TODO: download the test benchmarks from Qwen2.5-Math
    # (https://github.com/QwenLM/Qwen2.5-Math/tree/main/evaluation/data)
    # and prepare model_output_path and test_gold_path for each dataset.
|
    test_gold_path = "PATH_OF_THE_BENCHMARK"
    model_output_path = "PATH_OF_YOUR_MODEL_OUTPUTS"
    dataset_name = "DATASET_NAME"

    get_scores_on_math_benchmarks(model_output_path, test_gold_path, dataset_name)
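
    # To sweep every supported benchmark in one run (hypothetical paths, using
    # the same placeholder convention as above):
    #   for name in ["gsm8k", "math", "minerva_math", "gaokao2023en",
    #                "olympiadbench", "collegemath"]:
    #       get_scores_on_math_benchmarks(
    #           "PATH_OF_YOUR_MODEL_OUTPUTS/%s.jsonl" % name,
    #           "PATH_OF_THE_BENCHMARK/%s/test.jsonl" % name,
    #           name,
    #       )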