AgentVerse committed
Commit 7569f5d · Parent(s): 670a607
first commit
Browse files
- scripts/__init__.py +0 -0
- scripts/evaluate_commongen.py +53 -0
- scripts/evaluate_logic.py +71 -0
- scripts/evaluate_math.py +93 -0
- scripts/evaluate_responsegen.py +112 -0
scripts/__init__.py
ADDED
File without changes
scripts/evaluate_commongen.py
ADDED
@@ -0,0 +1,53 @@
# Evaluate CommonGen outputs: for each prediction, measure the fraction of
# input concepts whose lemmas appear in the generated sentence (coverage).
import argparse
import json
import spacy
from tqdm import tqdm


nlp = spacy.load("en_core_web_sm")


def coverage_score(preds, concept_sets):
    covs = []
    missings = []
    for p, cs in tqdm(zip(preds, concept_sets), total=len(preds)):
        cs = set(cs)
        lemmas = set()
        for token in nlp(p):
            lemmas.add(token.lemma_)
        cov = len(lemmas & cs) / len(cs)
        covs.append(cov)
        missings.append(cs - lemmas)
    return sum(covs) / len(covs), missings


def scoring(preds, concept_sets):
    # Scores, Coverage, Coverage_POS = pivot_score.score(pred, ref, concept, ori_concepts, scoring="steiner_tree", parser="spacy", verbose=False)
    coverage, missing_tokens = coverage_score(preds, concept_sets)
    # print(f"System level Score: {sum(Scores)/len(Scores)*100:.2f}")
    print(f"System level Coverage: {coverage*100:.2f}")
    # print(f"System level Coverage_POS: {sum(Coverage_POS)/len(Scores)*100:.2f}")
    return coverage, missing_tokens


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", default="", type=str)
    args = parser.parse_args()
    # nlp.pipeline = [("tagger", nlp.tagger), ("parser", nlp.parser)]

    # Score both the final multi-agent response and the first single-agent draft.
    preds_final = []
    preds_first = []
    concept_sets = []
    with open(args.path) as f:
        for line in f:
            line = json.loads(line)
            preds_final.append(line["response"])
            if line["logs"][0]["module"] == "Role Assigner":
                preds_first.append(line["logs"][1]["content"])
            else:
                preds_first.append(line["logs"][0]["content"])
            concept_sets.append(line["input"])

    scoring(preds_final, concept_sets)
    scoring(preds_first, concept_sets)
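Usage sketch (the file name and record values below are invented for illustration): each line of the --path file must be a JSON object carrying the "input" concept list, the final "response", and the "logs" list of module messages that the script reads above.

import json

# Hypothetical record with the fields evaluate_commongen.py expects.
record = {
    "input": ["dog", "frisbee", "catch", "throw"],
    "logs": [
        {"module": "Role Assigner", "content": "..."},
        {"module": "Solver", "content": "A dog runs to catch the frisbee after the throw."},
    ],
    "response": "The dog jumps to catch the frisbee that its owner throws.",
}
with open("sample_commongen.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")

# Then (requires spaCy's en_core_web_sm model):
#   python scripts/evaluate_commongen.py --path sample_commongen.jsonl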
scripts/evaluate_logic.py
ADDED
@@ -0,0 +1,71 @@
# Evaluate logic-puzzle results: pull the \boxed{...} answer out of each
# response, strip ordinal/"house" suffixes, and compare it with the integer
# label. The first pass (i == 0) scores the final response, the second pass
# scores the first single-agent answer from the logs.
import re
import json
import subprocess
from importlib import reload
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
parser.add_argument("--max_line", type=int, default=1000000000000)
args = parser.parse_args()


def check_corr(result: str, correct_solution: str, tol: float = 1e-3):
    result = result.replace(",", "")
    if result.strip() == correct_solution.strip():
        return 1
    try:
        result = float(result.strip())
        correct_solution = float(correct_solution.strip())
        return abs(result - correct_solution) < tol
    except:
        return 0


final_accs = []
err_cnts = []
for i in range(2):
    acc = 0
    total = 0
    err_cnt = 0
    with open(args.path) as f:
        for idx, line in enumerate(f):
            if idx == args.max_line:
                break
            line = json.loads(line)
            label = str(line["label"])
            if i == 0:
                response = line["response"]
            else:
                if line["logs"][0]["module"] == "Role Assigner":
                    response = line["logs"][1]["content"]
                else:
                    response = line["logs"][0]["content"]
            total += 1
            result = re.findall(r"\\boxed\{(.+?)\}", response)
            if len(result) == 0:
                err_cnt += 1
                # print(response)
                continue
            result = result[0]
            result = re.sub(r"\\text\{.+\}?", "", result)
            result = (
                result.replace("rd", "")
                .replace("nd", "")
                .replace("st", "")
                .replace("th", "")
                .replace("House", "")
                .replace("house", "")
                .replace("\\", "")
            )

            # acc += check_corr(result, label)
            try:
                acc += int(result) == int(label)
            except:
                print(result)

    final_accs.append(acc / total)
    err_cnts.append(err_cnt)
print(final_accs)
print(err_cnts)
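For reference, a self-contained sketch of the answer extraction performed above: it takes the first \boxed{...} group from a response, strips \text{...}, ordinal suffixes, and "house", then compares integers. The sample response string is invented.

import re

response = r"The green owner lives in the \boxed{3rd house}."
match = re.findall(r"\\boxed\{(.+?)\}", response)
if match:
    result = re.sub(r"\\text\{.+\}?", "", match[0])
    result = (
        result.replace("rd", "")
        .replace("nd", "")
        .replace("st", "")
        .replace("th", "")
        .replace("House", "")
        .replace("house", "")
        .replace("\\", "")
    )
    print(int(result) == 3)  # True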
scripts/evaluate_math.py
ADDED
@@ -0,0 +1,93 @@
# Evaluate math results: extract the \boxed{...} answer from each response and
# check it against the label with check_corr (exact match or numeric tolerance).
import re
import json
import subprocess
from importlib import reload
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--path", type=str, required=True)
parser.add_argument("--max_line", type=int, default=1000000000000)
parser.add_argument("--ci_smoke_test", action="store_true")
args = parser.parse_args()


def check_corr(result: str, correct_solution: str, tol: float = 1e-3):
    result = result.replace(",", "")
    if result.strip() == correct_solution.strip():
        return 1
    try:
        result = float(result.strip())
        correct_solution = float(correct_solution.strip())
        return abs(result - correct_solution) < tol
    except:
        return 0


# Earlier evaluation that executed the generated code directly (kept commented out):
# final_accs = []
# for i in range(2):
#     acc = 0
#     total = 0
#     with open(args.path) as f:
#         for line in f:
#             line = json.loads(line)
#             label = str(line["label"])
#             if i == 0:
#                 code = line["response"]
#             else:
#                 code = line["logs"][0]["content"]
#             total += 1
#             code = code.strip().replace("```", "")
#             code = code.lstrip("python3")
#             code = code.lstrip("python")
#             with open("tmp.py", "w") as f:
#                 f.write(code)
#
#             try:
#                 import tmp
#
#                 reload(tmp)
#                 result = str(tmp.solution())
#                 is_corr = check_corr(result, label)
#
#                 is_corr = int(is_corr)
#                 # Step 2
#                 if is_corr:
#                     acc += 1
#             except:
#                 print(code)
#     final_accs.append(acc / total)
# print(final_accs)

final_accs = []
err_cnts = []
for i in range(2):
    acc = 0
    total = 0
    err_cnt = 0
    with open(args.path) as f:
        for idx, line in enumerate(f):
            if idx == args.max_line:
                break
            line = json.loads(line)
            label = str(line["label"])
            if i == 0:
                response = line["response"]
            else:
                if line["logs"][0]["module"] == "Role Assigner":
                    response = line["logs"][1]["content"]
                else:
                    response = line["logs"][0]["content"]
            total += 1
            result = re.findall(r"\\boxed\{(.+?)\}", response)
            if len(result) == 0:
                err_cnt += 1
                print(response)
                continue
            result = result[0]
            acc += check_corr(result, label)
    final_accs.append(acc / total)
    err_cnts.append(err_cnt)
print(final_accs)
print(err_cnts)
if args.ci_smoke_test is True:
    assert final_accs[0] == 1.0
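As a quick illustration of the comparison rule in check_corr above (exact string match after removing commas, otherwise absolute difference under tol), the helper below is a hypothetical stand-in mirroring the same logic on made-up values.

# Hypothetical stand-in mirroring check_corr's comparison rule (tol = 1e-3).
def approx_equal(result: str, label: str, tol: float = 1e-3) -> bool:
    result = result.replace(",", "")
    if result.strip() == label.strip():
        return True
    try:
        return abs(float(result) - float(label)) < tol
    except ValueError:
        return False

print(approx_equal("1,000", "1000"))   # True: comma removed, exact string match
print(approx_equal("2.0004", "2.0"))   # True: difference 0.0004 is within tol
print(approx_equal("1/3", "0.3333"))   # False: "1/3" cannot be parsed as a float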
scripts/evaluate_responsegen.py
ADDED
@@ -0,0 +1,112 @@
# Pairwise evaluation for response generation: ask GPT-4 which of two responses
# is better (A = pipeline output, B = reference/initial response), log the
# judgements to eval.md, and report the win rate of response A.
import os
import json
from string import Template
import time
import openai
from tqdm import tqdm

with open("./results.jsonl", "r") as f:
    lines = list(f.readlines())

eval_prompt = r"""Which response is better given this context:
${context}

Response A: ${response_a}

Response B: ${response_b}.

Pick your answer from ['Response A', 'Response B', 'both', 'neither']. Generate a short explanation for your choice first. Then, generate 'The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither'.

Your response format should be:
Explanation: <explanation>
Answer: ('The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither')
"""

res = []
eval = []


def write_eval_to_file(file, skip=0):
    for idx, line in tqdm(enumerate(lines)):
        if idx < skip:
            continue
        data = json.loads(line)
        # print(idx + 1)
        context = data["input"]
        response_a = data["response"]
        response_b = data["label"]

        # Quote the context and both responses as markdown blockquotes.
        context_quote = "> " + "\n> ".join(context.split("\n"))
        response_a_quote = "> " + "\n> ".join(response_a.split("\n"))
        response_b_quote = "> " + "\n> ".join(response_b.split("\n"))

        file.write(f"## {idx + 1}\n\n")
        file.write(f"Context:\n{context_quote}\n\n")
        file.write(f"Response A (pipeline):\n{response_a_quote}\n\n")
        file.write(f"Response B (init):\n{response_b_quote}\n\n")

        prompt = Template(eval_prompt).safe_substitute(
            context=context, response_a=response_a, response_b=response_b
        )
        # Retry the API call with quadratic backoff, capped at 60 seconds.
        for i in range(100):
            try:
                eval_response = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                )
            except:
                time.sleep(min(i**2, 60))
                continue
            break
        text = eval_response["choices"][0]["message"]["content"]
        eval.append(text)
        text = text.replace("\n", "\n\n")
        file.write(f"{text}\n\n")

        if "The better response is A" in text:
            res.append("A")
        elif "The better response is B" in text:
            res.append("B")
        elif "The better response is both" in text:
            res.append("both")
        elif "The better response is neither" in text:
            res.append("neither")
        else:
            res.append("unknown")


if not os.path.exists("./eval.md"):
    with open("./eval.md", "w") as f:
        f.write("# ResponseGen Eval\n\n")
        write_eval_to_file(f)
    win_cnt = 0
    for r in res:
        if r == "A":
            win_cnt += 1
    print(f"win rate: {win_cnt / len(res)}")
else:
    # Resume from an existing eval.md: count the judgements already written,
    # then evaluate only the remaining lines.
    win_cnt = 0
    total_cnt = 0
    with open("./eval.md", "r") as f:
        for line in f:
            if line.startswith("Answer"):
                total_cnt += 1
                if "The better response is A" in line:
                    res.append("A")
                elif "The better response is B" in line:
                    res.append("B")
                elif "The better response is both" in line:
                    res.append("both")
                elif "The better response is neither" in line:
                    res.append("neither")
                else:
                    res.append("unknown")
    with open("./eval.md", "a") as f:
        f.write("\n")
        write_eval_to_file(f, total_cnt)
    win_cnt = 0
    for r in res:
        if r == "A":
            win_cnt += 1
    print(f"win rate: {win_cnt / len(res)}")
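Usage sketch (assuming the pre-1.0 openai SDK, which exposes openai.ChatCompletion and reads the OPENAI_API_KEY environment variable): the script expects ./results.jsonl in the working directory, with each record carrying the "input", "response" (pipeline output, graded as A), and "label" (reference, graded as B) fields read above. The record below is invented for illustration.

import json

# Hypothetical ./results.jsonl record with the fields evaluate_responsegen.py reads.
record = {
    "input": "User: Can you recommend a quiet cafe near the station?",
    "response": "The Reading Room on Elm Street is quiet and two minutes from the station.",
    "label": "There are a few cafes near the station you could try.",
}
with open("results.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")

# export OPENAI_API_KEY=...
# python scripts/evaluate_responsegen.py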