Spaces:
Build error
Build error
File size: 3,690 Bytes
7569f5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import os
import json
from string import Template
import time
import openai
from tqdm import tqdm
# Load every record to be judged up front; write_eval_to_file indexes into
# this list, and the resume logic relies on the order being stable across runs.
# JSON Lines is UTF-8 by specification, so decode explicitly instead of
# depending on the platform's default encoding.
with open("./results.jsonl", "r", encoding="utf-8") as f:
    # Drop blank lines (e.g. a trailing newline at EOF) so json.loads is
    # never handed an empty record.
    lines = [line for line in f if line.strip()]
# Judge prompt sent to the model for every record. It is used as a
# string.Template: ${context}, ${response_a} and ${response_b} are filled in
# by write_eval_to_file. The "The better response is ..." phrases are matched
# verbatim when replies are parsed, so keep the wording in sync with that code.
eval_prompt = r"""Which response is better given this context:
${context}
Response A: ${response_a}
Response B: ${response_b}.
Pick your answer from ['Response A', 'Response B', 'both', 'neither']. Generate a short explanation for your choice first. Then, generate 'The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither'.
Your response format should be:
Explanation: <explanation>
Answer: ('The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither')
"""
# One verdict per judged record: "A", "B", "both", "neither" or "unknown".
res = []
# Raw model replies collected during this run, in evaluation order.
# NOTE: this name shadows the builtin eval() for the rest of the module.
eval = []
def _quote_markdown(text):
    """Return *text* as a Markdown blockquote (each line prefixed with "> ")."""
    return "> " + "\n> ".join(text.split("\n"))


def _parse_verdict(text):
    """Map a judge reply to "A", "B", "both", "neither" or "unknown"."""
    # Checked in this order; the first phrase found in the reply wins.
    for phrase, verdict in (
        ("The better response is A", "A"),
        ("The better response is B", "B"),
        ("The better response is both", "both"),
        ("The better response is neither", "neither"),
    ):
        if phrase in text:
            return verdict
    return "unknown"


def write_eval_to_file(file, skip=0):
    """Judge each record with GPT-4 and append a Markdown entry to *file*.

    For every record in the module-level ``lines`` (one JSON object per line
    with the keys "input", "response" and "label"), the model is asked which
    of the two responses is better. The raw reply is appended to the
    module-level ``eval`` list and the parsed verdict to ``res``.

    Args:
        file: Writable text file object that receives the Markdown report.
        skip: Number of leading records to skip (already evaluated in a
            previous run).

    Raises:
        RuntimeError: If the API call for a record still fails after 100
            attempts. The last API error is chained as the cause.
    """
    for idx, line in tqdm(enumerate(lines), total=len(lines)):
        if idx < skip:
            continue
        data = json.loads(line)
        context = data["input"]
        response_a = data["response"]
        response_b = data["label"]

        prompt = Template(eval_prompt).safe_substitute(
            context=context, response_a=response_a, response_b=response_b
        )

        # Retry with a quadratic back-off capped at 60 seconds. Only
        # Exception is caught so Ctrl-C / SystemExit still stop the run.
        last_error = None
        for attempt in range(100):
            try:
                eval_response = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                )
            except Exception as exc:
                last_error = exc
                time.sleep(min(attempt**2, 60))
                continue
            break
        else:
            # Without this, a stale response from the previous record (or an
            # unbound name) would be used and a wrong verdict recorded.
            raise RuntimeError(
                f"evaluation of record {idx + 1} failed after 100 attempts"
            ) from last_error

        text = eval_response["choices"][0]["message"]["content"]
        eval.append(text)
        # Double the newlines so each reply line is its own Markdown paragraph.
        text = text.replace("\n", "\n\n")

        # The whole entry is written only after the reply is available, so an
        # aborted run does not leave a header without an answer in the report.
        file.write(f"## {idx + 1}\n\n")
        file.write(f"Context:\n" f"{_quote_markdown(context)}\n\n")
        file.write(f"Response A (pipeline):\n" f"{_quote_markdown(response_a)}\n\n")
        file.write(f"Response B (init):\n" f"{_quote_markdown(response_b)}\n\n")
        file.write(f"{text}\n\n")

        res.append(_parse_verdict(text))
# Script entry point: start a fresh report, or resume an existing one, then
# print how often response A was judged the better one.
if not os.path.exists("./eval.md"):
    # Fresh run: create the report and evaluate every record.
    with open("./eval.md", "w", encoding="utf-8") as f:
        f.write("# ResponseGen Eval\n\n")
        write_eval_to_file(f)
else:
    # Resume: recover the verdicts already in the report. Each completed
    # entry contributes one line starting with "Answer", so the number of
    # such lines is the number of records to skip.
    total_cnt = 0
    with open("./eval.md", "r", encoding="utf-8") as f:
        for line in f:
            if not line.startswith("Answer"):
                continue
            total_cnt += 1
            # Checked in this order; the first phrase found wins.
            for phrase, verdict in (
                ("The better response is A", "A"),
                ("The better response is B", "B"),
                ("The better response is both", "both"),
                ("The better response is neither", "neither"),
            ):
                if phrase in line:
                    res.append(verdict)
                    break
            else:
                res.append("unknown")
    with open("./eval.md", "a", encoding="utf-8") as f:
        f.write("\n")
        write_eval_to_file(f, total_cnt)

win_cnt = res.count("A")
if res:
    print(f"win rate: {win_cnt / len(res)}")
else:
    # Nothing was evaluated (e.g. empty input); avoid dividing by zero.
    print("win rate: n/a (no evaluations recorded)")
|