AgentVerse committed on
Commit
7569f5d
·
1 Parent(s): 670a607

first commit

Browse files
scripts/__init__.py ADDED
File without changes
scripts/evaluate_commongen.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import spacy
4
+ from tqdm import tqdm
5
+
6
+
7
# Load the small English spaCy pipeline once at import time; coverage_score()
# calls it on every prediction to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")
8
+
9
+
10
def coverage_score(preds, concept_sets):
    """Measure how well each prediction covers its required concepts.

    Every prediction is run through the module-level spaCy pipeline ``nlp``;
    a concept counts as covered when it is equal to one of the token lemmas.

    Args:
        preds: Generated sentences, one per sample.
        concept_sets: Concept collections aligned with ``preds``.

    Returns:
        A ``(mean_coverage, missings)`` tuple. ``mean_coverage`` is the
        average per-sample fraction of covered concepts, and ``missings``
        holds, per sample, the set of concepts absent from the lemmas.
        ``(0.0, [])`` is returned when there are no samples.
    """
    covs = []
    missings = []
    for p, cs in tqdm(zip(preds, concept_sets), total=len(preds)):
        cs = set(cs)
        lemmas = {token.lemma_ for token in nlp(p)}
        # A sample with no required concepts has nothing left to cover, so it
        # counts as fully covered instead of dividing by zero.
        cov = len(lemmas & cs) / len(cs) if cs else 1.0
        covs.append(cov)
        missings.append(cs - lemmas)
    if not covs:
        # No samples at all: avoid ZeroDivisionError on the mean.
        return 0.0, missings
    return sum(covs) / len(covs), missings
22
+
23
+
24
def scoring(preds, concept_sets):
    """Print the system-level concept coverage and return the raw results.

    Only lemma coverage is reported; the value is printed as a percentage
    with two decimals.

    Args:
        preds: Generated sentences, one per sample.
        concept_sets: Concept collections aligned with ``preds``.

    Returns:
        The ``(coverage, missing_tokens)`` pair produced by
        ``coverage_score``.
    """
    outcome = coverage_score(preds, concept_sets)
    coverage = outcome[0]
    print(f"System level Coverage: {coverage*100:.2f}")
    return outcome
31
+
32
+
33
if __name__ == "__main__":
    # Evaluate a JSONL results file: report concept coverage for the final
    # responses and for the first logged responses.
    parser = argparse.ArgumentParser()
    # Required: the previous default of "" only deferred the failure to
    # open(""), which raised a less helpful FileNotFoundError.
    parser.add_argument("--path", type=str, required=True)
    args = parser.parse_args()

    preds_final = []
    preds_first = []
    concept_sets = []
    with open(args.path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) in the JSONL.
                continue
            record = json.loads(line)
            preds_final.append(record["response"])
            logs = record["logs"]
            # When the first log entry comes from the "Role Assigner" module,
            # the first actual response is the second entry.
            first_idx = 1 if logs[0]["module"] == "Role Assigner" else 0
            preds_first.append(logs[first_idx]["content"])
            concept_sets.append(record["input"])

    scoring(preds_final, concept_sets)
    scoring(preds_first, concept_sets)
scripts/evaluate_logic.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import subprocess
4
+ from importlib import reload
5
+ from argparse import ArgumentParser
6
+
7
# Command-line interface; arguments are parsed at import time because this
# module is a flat script.
parser = ArgumentParser()
# --path: results file that the scoring loop reads line by line as JSON.
parser.add_argument("--path", type=str, required=True)
# --max_line: the scoring loop stops once the line index equals this value;
# the very large default effectively means "no limit".
parser.add_argument("--max_line", type=int, default=1000000000000)
args = parser.parse_args()
11
+
12
+
13
def check_corr(result: str, correct_solution: str, tol: float = 1e-3) -> int:
    """Return 1 when ``result`` matches ``correct_solution``, else 0.

    Commas are removed from ``result`` (thousands separators) and both
    strings are stripped. They match when they are textually equal or when
    both parse as floats that differ by less than ``tol``.

    Args:
        result: Candidate answer extracted from a model response.
        correct_solution: Reference answer.
        tol: Absolute tolerance for the numeric comparison.

    Returns:
        ``1`` for a match, ``0`` otherwise (including non-numeric strings
        that are not textually equal).
    """
    result = result.replace(",", "").strip()
    correct_solution = correct_solution.strip()
    if result == correct_solution:
        return 1
    try:
        difference = abs(float(result) - float(correct_solution))
    except ValueError:
        # At least one side is not a number, so only the exact textual
        # comparison above could have matched.
        return 0
    return int(difference < tol)
23
+
24
+
25
# Score the results file twice: pass 0 uses each record's final "response",
# pass 1 uses the first logged response (skipping a leading "Role Assigner"
# log entry when present).
_BOXED_RE = re.compile(r"\\boxed\{(.+?)\}")
_TEXT_RE = re.compile(r"\\text\{.+\}?")
# Substrings stripped from the boxed answer before integer conversion
# (ordinal suffixes, the word "House"/"house", and stray backslashes).
_NOISE = ("rd", "nd", "st", "th", "House", "house", "\\")

final_accs = []
err_cnts = []
for i in range(2):
    acc = 0
    total = 0
    err_cnt = 0
    with open(args.path) as f:
        for idx, line in enumerate(f):
            if idx == args.max_line:
                break
            line = json.loads(line)
            label = str(line["label"])
            if i == 0:
                response = line["response"]
            elif line["logs"][0]["module"] == "Role Assigner":
                response = line["logs"][1]["content"]
            else:
                response = line["logs"][0]["content"]
            total += 1
            boxed = _BOXED_RE.findall(response)
            if not boxed:
                # No \boxed{...} answer found in this response.
                err_cnt += 1
                continue
            result = _TEXT_RE.sub("", boxed[0])
            for noise in _NOISE:
                result = result.replace(noise, "")

            try:
                acc += int(result) == int(label)
            except ValueError:
                # Not an integer after cleanup: shown for inspection. The
                # sample stays in `total` but is not added to `err_cnt`.
                print(result)

    # Guard against an empty file (or --max_line 0), where total is zero.
    final_accs.append(acc / total if total else 0.0)
    err_cnts.append(err_cnt)
print(final_accs)
print(err_cnts)
scripts/evaluate_math.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import subprocess
4
+ from importlib import reload
5
+ from argparse import ArgumentParser
6
+
7
# Command-line interface; arguments are parsed at import time because this
# module is a flat script.
parser = ArgumentParser()
# --path: results file that the scoring loop reads line by line as JSON.
parser.add_argument("--path", type=str, required=True)
# --max_line: the scoring loop stops once the line index equals this value;
# the very large default effectively means "no limit".
parser.add_argument("--max_line", type=int, default=1000000000000)
# --ci_smoke_test: when set, the script additionally checks that the accuracy
# of the final responses is exactly 1.0.
parser.add_argument("--ci_smoke_test", action="store_true")
args = parser.parse_args()
12
+
13
+
14
def check_corr(result: str, correct_solution: str, tol: float = 1e-3) -> int:
    """Return 1 when ``result`` matches ``correct_solution``, else 0.

    Commas are removed from ``result`` (thousands separators) and both
    strings are stripped. They match when they are textually equal or when
    both parse as floats that differ by less than ``tol``.

    Args:
        result: Candidate answer extracted from a model response.
        correct_solution: Reference answer.
        tol: Absolute tolerance for the numeric comparison.

    Returns:
        ``1`` for a match, ``0`` otherwise (including non-numeric strings
        that are not textually equal).
    """
    result = result.replace(",", "").strip()
    correct_solution = correct_solution.strip()
    if result == correct_solution:
        return 1
    try:
        candidate = float(result)
        reference = float(correct_solution)
    except ValueError:
        # At least one side is not a number, so only the exact textual
        # comparison above could have matched.
        return 0
    return int(abs(candidate - reference) < tol)
24
+
25
+
26
+ # final_accs = []
27
+ # for i in range(2):
28
+ # acc = 0
29
+ # total = 0
30
+ # with open(args.path) as f:
31
+ # for line in f:
32
+ # line = json.loads(line)
33
+ # label = str(line["label"])
34
+ # if i == 0:
35
+ # code = line["response"]
36
+ # else:
37
+ # code = line["logs"][0]["content"]
38
+ # total += 1
39
+ # code = code.strip().replace("```", "")
40
+ # code = code.lstrip("python3")
41
+ # code = code.lstrip("python")
42
+ # with open("tmp.py", "w") as f:
43
+ # f.write(code)
44
+
45
+ # try:
46
+ # import tmp
47
+
48
+ # reload(tmp)
49
+ # result = str(tmp.solution())
50
+ # is_corr = check_corr(result, label)
51
+
52
+ # is_corr = int(is_corr)
53
+ # # Step 2
54
+ # if is_corr:
55
+ # acc += 1
56
+ # except:
57
+ # print(code)
58
+ # final_accs.append(acc / total)
59
+ # print(final_accs)
60
+
61
# Score the results file twice: pass 0 uses each record's final "response",
# pass 1 uses the first logged response (skipping a leading "Role Assigner"
# log entry when present).
_BOXED_RE = re.compile(r"\\boxed\{(.+?)\}")

final_accs = []
err_cnts = []
for i in range(2):
    acc = 0
    total = 0
    err_cnt = 0
    with open(args.path) as f:
        for idx, line in enumerate(f):
            if idx == args.max_line:
                break
            line = json.loads(line)
            label = str(line["label"])
            if i == 0:
                response = line["response"]
            elif line["logs"][0]["module"] == "Role Assigner":
                response = line["logs"][1]["content"]
            else:
                response = line["logs"][0]["content"]
            total += 1
            boxed = _BOXED_RE.findall(response)
            if not boxed:
                # No \boxed{...} answer: count it as an error and show the
                # raw response for inspection.
                err_cnt += 1
                print(response)
                continue
            acc += check_corr(boxed[0], label)
    # Guard against an empty file (or --max_line 0), where total is zero.
    final_accs.append(acc / total if total else 0.0)
    err_cnts.append(err_cnt)
print(final_accs)
print(err_cnts)
if args.ci_smoke_test and final_accs[0] != 1.0:
    # Raised explicitly rather than via `assert`, so the smoke test still
    # fails when Python runs with -O (which strips assert statements).
    raise AssertionError(
        f"CI smoke test failed: expected accuracy 1.0, got {final_accs[0]}"
    )
scripts/evaluate_responsegen.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from string import Template
4
+ import time
5
+ import openai
6
+ from tqdm import tqdm
7
+
8
# Read every line of the results file up front; write_eval_to_file() parses
# each line as JSON.
with open("./results.jsonl", "r") as f:
    lines = list(f.readlines())

# Judge prompt, filled in through string.Template with ${context},
# ${response_a} and ${response_b}. The answer phrases requested here are the
# exact strings searched for when the judge output is classified.
eval_prompt = r"""Which response is better given this context:
${context}

Response A: ${response_a}

Response B: ${response_b}.

Pick your answer from ['Response A', 'Response B', 'both', 'neither']. Generate a short explanation for your choice first. Then, generate 'The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither'.

Your response format should be:
Explanation: <explanation>
Answer: ('The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither')
"""

# Verdict per evaluated sample: "A", "B", "both", "neither" or "unknown".
res = []
# Raw judge outputs, one per evaluated sample.
# NOTE(review): this name shadows the builtin eval(); renaming it would have
# to be coordinated with write_eval_to_file().
eval = []
27
+
28
+
29
def write_eval_to_file(file, skip=0):
    """Judge every pending sample with GPT-4 and append the report to ``file``.

    For each entry of the module-level ``lines`` (from index ``skip`` on),
    the context and both responses are written as Markdown block quotes, the
    judge prompt is sent to the OpenAI chat API, the raw answer is appended
    to the module-level ``eval`` list, and the parsed verdict ("A", "B",
    "both", "neither" or "unknown") is appended to ``res``.

    Args:
        file: Writable text file object receiving the Markdown report.
        skip: Number of leading samples to skip (already evaluated).

    Raises:
        RuntimeError: If the API call still fails after 100 attempts.
    """
    verdict_markers = (
        ("The better response is A", "A"),
        ("The better response is B", "B"),
        ("The better response is both", "both"),
        ("The better response is neither", "neither"),
    )
    for idx, line in tqdm(enumerate(lines), total=len(lines)):
        if idx < skip:
            continue
        data = json.loads(line)
        context = data["input"]
        response_a = data["response"]
        response_b = data["label"]

        # Render each text as a Markdown block quote.
        context_quote = "> " + "\n> ".join(context.split("\n"))
        response_a_quote = "> " + "\n> ".join(response_a.split("\n"))
        response_b_quote = "> " + "\n> ".join(response_b.split("\n"))

        # Write through the `file` argument; the previous code silently
        # relied on a module-level variable named `f`.
        file.write(f"## {idx + 1}\n\n")
        file.write(f"Context:\n" f"{context_quote}\n\n")
        file.write(f"Response A (pipeline):\n" f"{response_a_quote}\n\n")
        file.write(f"Response B (init):\n" f"{response_b_quote}\n\n")

        prompt = Template(eval_prompt).safe_substitute(
            context=context, response_a=response_a, response_b=response_b
        )
        for attempt in range(100):
            try:
                eval_response = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                )
            except Exception:
                # Quadratic back-off capped at 60 seconds. `Exception`
                # (not a bare except) keeps Ctrl-C working.
                time.sleep(min(attempt**2, 60))
                continue
            break
        else:
            # Without this, a total failure would reuse the previous
            # sample's response (or raise NameError on the first sample).
            raise RuntimeError(
                f"GPT-4 evaluation failed for sample {idx + 1} after 100 attempts"
            )
        text = eval_response["choices"][0]["message"]["content"]
        eval.append(text)
        # Double the newlines so each line becomes its own Markdown paragraph.
        text = text.replace("\n", "\n\n")
        file.write(f"{text}\n\n")

        for marker, verdict in verdict_markers:
            if marker in text:
                res.append(verdict)
                break
        else:
            res.append("unknown")
77
+
78
+
79
# Driver: start a fresh report when ./eval.md does not exist, otherwise
# resume after the samples that already have an "Answer" line in it.
if not os.path.exists("./eval.md"):
    with open("./eval.md", "w") as f:
        f.write("# ResponseGen Eval\n\n")
        write_eval_to_file(f)
else:
    total_cnt = 0
    with open("./eval.md", "r") as f:
        for line in f:
            if not line.startswith("Answer"):
                continue
            # One "Answer" line per evaluated sample: rebuild its verdict.
            total_cnt += 1
            if "The better response is A" in line:
                res.append("A")
            elif "The better response is B" in line:
                res.append("B")
            elif "The better response is both" in line:
                res.append("both")
            elif "The better response is neither" in line:
                res.append("neither")
            else:
                res.append("unknown")
    with open("./eval.md", "a") as f:
        f.write("\n")
        write_eval_to_file(f, total_cnt)

# Share of samples where response A (the pipeline output) was judged better.
win_cnt = sum(1 for r in res if r == "A")
if res:
    print(f"win rate: {win_cnt / len(res)}")
else:
    # Nothing was evaluated (empty results file): avoid ZeroDivisionError.
    print("win rate: n/a (no evaluated samples)")