AgentVerse committed
Commit 7569f5d · Parent(s): 670a607
first commit
Browse files
- scripts/__init__.py +0 -0
- scripts/evaluate_commongen.py +53 -0
- scripts/evaluate_logic.py +71 -0
- scripts/evaluate_math.py +93 -0
- scripts/evaluate_responsegen.py +112 -0
scripts/__init__.py
ADDED
File without changes
scripts/evaluate_commongen.py
ADDED
@@ -0,0 +1,53 @@
# Evaluate CommonGen outputs: for each prediction, measure the fraction of
# input concepts whose lemmas appear in the generated sentence (coverage).
import argparse
import json
import spacy
from tqdm import tqdm


nlp = spacy.load("en_core_web_sm")


def coverage_score(preds, concept_sets):
    covs = []
    missings = []
    for p, cs in tqdm(zip(preds, concept_sets), total=len(preds)):
        cs = set(cs)
        lemmas = set()
        for token in nlp(p):
            lemmas.add(token.lemma_)
        cov = len(lemmas & cs) / len(cs)
        covs.append(cov)
        missings.append(cs - lemmas)
    return sum(covs) / len(covs), missings


def scoring(preds, concept_sets):
    # Scores, Coverage, Coverage_POS = pivot_score.score(pred, ref, concept, ori_concepts, scoring="steiner_tree", parser="spacy", verbose=False)
    coverage, missing_tokens = coverage_score(preds, concept_sets)
    # print(f"System level Score: {sum(Scores)/len(Scores)*100:.2f}")
    print(f"System level Coverage: {coverage*100:.2f}")
    # print(f"System level Coverage_POS: {sum(Coverage_POS)/len(Scores)*100:.2f}")
    return coverage, missing_tokens


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", default="", type=str)
    args = parser.parse_args()
    # nlp.pipeline = [("tagger", nlp.tagger), ("parser", nlp.parser)]

    # Score both the final multi-agent response and the first single-agent draft.
    preds_final = []
    preds_first = []
    concept_sets = []
    with open(args.path) as f:
        for line in f:
            line = json.loads(line)
            preds_final.append(line["response"])
            if line["logs"][0]["module"] == "Role Assigner":
                preds_first.append(line["logs"][1]["content"])
            else:
                preds_first.append(line["logs"][0]["content"])
            concept_sets.append(line["input"])

    scoring(preds_final, concept_sets)
    scoring(preds_first, concept_sets)
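Usage sketch (the file name and record values below are invented for illustration): each line of the --path file must be a JSON object carrying the "input" concept list, the final "response", and the "logs" list of module messages that the script reads above.

import json

# Hypothetical record with the fields evaluate_commongen.py expects.
record = {
    "input": ["dog", "frisbee", "catch", "throw"],
    "logs": [
        {"module": "Role Assigner", "content": "..."},
        {"module": "Solver", "content": "A dog runs to catch the frisbee after the throw."},
    ],
    "response": "The dog jumps to catch the frisbee that its owner throws.",
}
with open("sample_commongen.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")

# Then (requires spaCy's en_core_web_sm model):
#   python scripts/evaluate_commongen.py --path sample_commongen.jsonl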
scripts/evaluate_logic.py
ADDED
@@ -0,0 +1,71 @@
# Evaluate logic-puzzle results: pull the \boxed{...} answer out of each
# response, strip ordinal/"house" suffixes, and compare it with the integer
# label. The first pass (i == 0) scores the final response, the second pass
# scores the first single-agent answer from the logs.
import re
import json
import subprocess
from importlib import reload
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
parser.add_argument("--max_line", type=int, default=1000000000000)
args = parser.parse_args()


def check_corr(result: str, correct_solution: str, tol: float = 1e-3):
    result = result.replace(",", "")
    if result.strip() == correct_solution.strip():
        return 1
    try:
        result = float(result.strip())
        correct_solution = float(correct_solution.strip())
        return abs(result - correct_solution) < tol
    except:
        return 0


final_accs = []
err_cnts = []
for i in range(2):
    acc = 0
    total = 0
    err_cnt = 0
    with open(args.path) as f:
        for idx, line in enumerate(f):
            if idx == args.max_line:
                break
            line = json.loads(line)
            label = str(line["label"])
            if i == 0:
                response = line["response"]
            else:
                if line["logs"][0]["module"] == "Role Assigner":
                    response = line["logs"][1]["content"]
                else:
                    response = line["logs"][0]["content"]
            total += 1
            result = re.findall(r"\\boxed\{(.+?)\}", response)
            if len(result) == 0:
                err_cnt += 1
                # print(response)
                continue
            result = result[0]
            result = re.sub(r"\\text\{.+\}?", "", result)
            result = (
                result.replace("rd", "")
                .replace("nd", "")
                .replace("st", "")
                .replace("th", "")
                .replace("House", "")
                .replace("house", "")
                .replace("\\", "")
            )

            # acc += check_corr(result, label)
            try:
                acc += int(result) == int(label)
            except:
                print(result)

    final_accs.append(acc / total)
    err_cnts.append(err_cnt)
print(final_accs)
print(err_cnts)
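For reference, a self-contained sketch of the answer extraction performed above: it takes the first \boxed{...} group from a response, strips \text{...}, ordinal suffixes, and "house", then compares integers. The sample response string is invented.

import re

response = r"The green owner lives in the \boxed{3rd house}."
match = re.findall(r"\\boxed\{(.+?)\}", response)
if match:
    result = re.sub(r"\\text\{.+\}?", "", match[0])
    result = (
        result.replace("rd", "")
        .replace("nd", "")
        .replace("st", "")
        .replace("th", "")
        .replace("House", "")
        .replace("house", "")
        .replace("\\", "")
    )
    print(int(result) == 3)  # True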
scripts/evaluate_math.py
ADDED
@@ -0,0 +1,93 @@
# Evaluate math results: extract the \boxed{...} answer from each response and
# check it against the label with check_corr (exact match or numeric tolerance).
import re
import json
import subprocess
from importlib import reload
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
parser.add_argument("--max_line", type=int, default=1000000000000)
parser.add_argument("--ci_smoke_test", action="store_true")
args = parser.parse_args()


def check_corr(result: str, correct_solution: str, tol: float = 1e-3):
    result = result.replace(",", "")
    if result.strip() == correct_solution.strip():
        return 1
    try:
        result = float(result.strip())
        correct_solution = float(correct_solution.strip())
        return abs(result - correct_solution) < tol
    except:
        return 0


# Earlier evaluation that executed the generated code directly (kept commented out):
# final_accs = []
# for i in range(2):
#     acc = 0
#     total = 0
#     with open(args.path) as f:
#         for line in f:
#             line = json.loads(line)
#             label = str(line["label"])
#             if i == 0:
#                 code = line["response"]
#             else:
#                 code = line["logs"][0]["content"]
#             total += 1
#             code = code.strip().replace("```", "")
#             code = code.lstrip("python3")
#             code = code.lstrip("python")
#             with open("tmp.py", "w") as f:
#                 f.write(code)
#
#             try:
#                 import tmp
#
#                 reload(tmp)
#                 result = str(tmp.solution())
#                 is_corr = check_corr(result, label)
#
#                 is_corr = int(is_corr)
#                 # Step 2
#                 if is_corr:
#                     acc += 1
#             except:
#                 print(code)
#     final_accs.append(acc / total)
# print(final_accs)

final_accs = []
err_cnts = []
for i in range(2):
    acc = 0
    total = 0
    err_cnt = 0
    with open(args.path) as f:
        for idx, line in enumerate(f):
            if idx == args.max_line:
                break
            line = json.loads(line)
            label = str(line["label"])
            if i == 0:
                response = line["response"]
            else:
                if line["logs"][0]["module"] == "Role Assigner":
                    response = line["logs"][1]["content"]
                else:
                    response = line["logs"][0]["content"]
            total += 1
            result = re.findall(r"\\boxed\{(.+?)\}", response)
            if len(result) == 0:
                err_cnt += 1
                print(response)
                continue
            result = result[0]
            acc += check_corr(result, label)
    final_accs.append(acc / total)
    err_cnts.append(err_cnt)
print(final_accs)
print(err_cnts)
if args.ci_smoke_test is True:
    assert final_accs[0] == 1.0
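As a quick illustration of the comparison rule in check_corr above (exact string match after removing commas, otherwise absolute difference under tol), the helper below is a hypothetical stand-in mirroring the same logic on made-up values.

# Hypothetical stand-in mirroring check_corr's comparison rule (tol = 1e-3).
def approx_equal(result: str, label: str, tol: float = 1e-3) -> bool:
    result = result.replace(",", "")
    if result.strip() == label.strip():
        return True
    try:
        return abs(float(result) - float(label)) < tol
    except ValueError:
        return False

print(approx_equal("1,000", "1000"))   # True: comma removed, exact string match
print(approx_equal("2.0004", "2.0"))   # True: difference 0.0004 is within tol
print(approx_equal("1/3", "0.3333"))   # False: "1/3" cannot be parsed as a float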
scripts/evaluate_responsegen.py
ADDED
@@ -0,0 +1,112 @@
# Pairwise evaluation for response generation: ask GPT-4 which of two responses
# is better (A = pipeline output, B = reference/initial response), log the
# judgements to eval.md, and report the win rate of response A.
import os
import json
from string import Template
import time
import openai
from tqdm import tqdm

with open("./results.jsonl", "r") as f:
    lines = list(f.readlines())

eval_prompt = r"""Which response is better given this context:
${context}

Response A: ${response_a}

Response B: ${response_b}.

Pick your answer from ['Response A', 'Response B', 'both', 'neither']. Generate a short explanation for your choice first. Then, generate 'The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither'.

Your response format should be:
Explanation: <explanation>
Answer: ('The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither')
"""

res = []
eval = []


def write_eval_to_file(file, skip=0):
    for idx, line in tqdm(enumerate(lines)):
        if idx < skip:
            continue
        data = json.loads(line)
        # print(idx + 1)
        context = data["input"]
        response_a = data["response"]
        response_b = data["label"]

        # Quote the context and both responses as markdown blockquotes.
        context_quote = "> " + "\n> ".join(context.split("\n"))
        response_a_quote = "> " + "\n> ".join(response_a.split("\n"))
        response_b_quote = "> " + "\n> ".join(response_b.split("\n"))

        file.write(f"## {idx + 1}\n\n")
        file.write(f"Context:\n{context_quote}\n\n")
        file.write(f"Response A (pipeline):\n{response_a_quote}\n\n")
        file.write(f"Response B (init):\n{response_b_quote}\n\n")

        prompt = Template(eval_prompt).safe_substitute(
            context=context, response_a=response_a, response_b=response_b
        )
        # Retry the API call with quadratic backoff, capped at 60 seconds.
        for i in range(100):
            try:
                eval_response = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                )
            except:
                time.sleep(min(i**2, 60))
                continue
            break
        text = eval_response["choices"][0]["message"]["content"]
        eval.append(text)
        text = text.replace("\n", "\n\n")
        file.write(f"{text}\n\n")

        if "The better response is A" in text:
            res.append("A")
        elif "The better response is B" in text:
            res.append("B")
        elif "The better response is both" in text:
            res.append("both")
        elif "The better response is neither" in text:
            res.append("neither")
        else:
            res.append("unknown")


if not os.path.exists("./eval.md"):
    with open("./eval.md", "w") as f:
        f.write("# ResponseGen Eval\n\n")
        write_eval_to_file(f)
    win_cnt = 0
    for r in res:
        if r == "A":
            win_cnt += 1
    print(f"win rate: {win_cnt / len(res)}")
else:
    # Resume from an existing eval.md: count the judgements already written,
    # then evaluate only the remaining lines.
    win_cnt = 0
    total_cnt = 0
    with open("./eval.md", "r") as f:
        for line in f:
            if line.startswith("Answer"):
                total_cnt += 1
                if "The better response is A" in line:
                    res.append("A")
                elif "The better response is B" in line:
                    res.append("B")
                elif "The better response is both" in line:
                    res.append("both")
                elif "The better response is neither" in line:
                    res.append("neither")
                else:
                    res.append("unknown")
    with open("./eval.md", "a") as f:
        f.write("\n")
        write_eval_to_file(f, total_cnt)
    win_cnt = 0
    for r in res:
        if r == "A":
            win_cnt += 1
    print(f"win rate: {win_cnt / len(res)}")
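Usage sketch (assuming the pre-1.0 openai SDK, which exposes openai.ChatCompletion and reads the OPENAI_API_KEY environment variable): the script expects ./results.jsonl in the working directory, with each record carrying the "input", "response" (pipeline output, graded as A), and "label" (reference, graded as B) fields read above. The record below is invented for illustration.

import json

# Hypothetical ./results.jsonl record with the fields evaluate_responsegen.py reads.
record = {
    "input": "User: Can you recommend a quiet cafe near the station?",
    "response": "The Reading Room on Elm Street is quiet and two minutes from the station.",
    "label": "There are a few cafes near the station you could try.",
}
with open("results.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")

# export OPENAI_API_KEY=...
# python scripts/evaluate_responsegen.py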