File size: 3,690 Bytes
7569f5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import json
from string import Template
import time
import openai
from tqdm import tqdm

# Load every evaluation record up front; each element is one raw line of the
# JSONL file (parsed later, one record at a time).
with open("./results.jsonl", "r") as results_file:
    lines = results_file.readlines()

# Prompt template sent to the judge model. `${context}`, `${response_a}` and
# `${response_b}` are `string.Template` placeholders that write_eval_to_file
# fills in per record. The "The better response is ..." phrases are matched
# verbatim when parsing the verdict, so keep the wording in sync with the parser.
eval_prompt = r"""Which response is better given this context: 
${context}

Response A: ${response_a} 

Response B: ${response_b}. 

Pick your answer from ['Response A', 'Response B', 'both', 'neither']. Generate a short explanation for your choice first. Then, generate 'The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither'.

Your response format should be:
Explanation: <explanation>
Answer: ('The better response is A' or 'The better response is B' or 'The better response is both' or 'The better response is neither')
"""

# Accumulators filled in as records are judged:
#   res  - one parsed verdict per record ("A", "B", "both", "neither", "unknown")
#   eval - raw judge replies (NOTE: this name shadows the builtin `eval` here)
res, eval = [], []


def write_eval_to_file(file, skip=0):
    """Judge each result record with GPT-4 and write a Markdown report to ``file``.

    For every entry of the module-level ``lines`` (one JSON object per line)
    from index ``skip`` onward, this writes a section containing the quoted
    context and both responses, asks the ``gpt-4`` chat model which response
    is better, writes the model's reply, and records the verdict.

    Side effects: appends the raw reply to the module-level ``eval`` list and
    the parsed verdict ("A", "B", "both", "neither" or "unknown") to ``res``.

    Args:
        file: Writable text file object that receives the report.
        skip: Number of leading records to skip (e.g. already evaluated ones).

    Raises:
        RuntimeError: If the API call still fails after every retry.
    """
    max_attempts = 100
    for idx, line in tqdm(enumerate(lines), total=len(lines)):
        if idx < skip:
            continue
        data = json.loads(line)
        context = data["input"]
        response_a = data["response"]
        response_b = data["label"]

        # Render each text as a Markdown block quote ("> " on every line).
        context_quote = "> " + "\n> ".join(context.split("\n"))
        response_a_quote = "> " + "\n> ".join(response_a.split("\n"))
        response_b_quote = "> " + "\n> ".join(response_b.split("\n"))

        # Write through the `file` argument rather than whatever global
        # handle happens to be open in the caller.
        file.write(f"## {idx + 1}\n\n")
        file.write(f"Context:\n{context_quote}\n\n")
        file.write(f"Response A (pipeline):\n{response_a_quote}\n\n")
        file.write(f"Response B (init):\n{response_b_quote}\n\n")

        prompt = Template(eval_prompt).safe_substitute(
            context=context, response_a=response_a, response_b=response_b
        )

        # Retry with a capped quadratic back-off. `except Exception` (not a
        # bare except) keeps Ctrl-C / SystemExit working during long runs.
        eval_response = None
        last_error = None
        for attempt in range(max_attempts):
            try:
                eval_response = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,
                )
            except Exception as err:
                last_error = err
                time.sleep(min(attempt**2, 60))
                continue
            break
        if eval_response is None:
            # Never fall through with a missing (or stale, from the previous
            # record) response: that would record a wrong verdict.
            raise RuntimeError(
                f"evaluation request for record {idx + 1} failed "
                f"after {max_attempts} attempts"
            ) from last_error

        text = eval_response["choices"][0]["message"]["content"]
        eval.append(text)
        # Double the newlines so each reply line becomes its own Markdown
        # paragraph (and "Answer: ..." starts a line of its own).
        text = text.replace("\n", "\n\n")
        file.write(f"{text}\n\n")
        # Push the finished record to disk so an interrupted run can resume.
        file.flush()

        # First matching phrase wins, checked in this fixed order.
        for verdict in ("A", "B", "both", "neither"):
            if f"The better response is {verdict}" in text:
                res.append(verdict)
                break
        else:
            res.append("unknown")


# Driver: create ./eval.md from scratch, or resume an interrupted run by
# re-reading the verdicts already in the report and evaluating only the rest.
total_cnt = 0
if not os.path.exists("./eval.md"):
    with open("./eval.md", "w") as f:
        f.write("# ResponseGen Eval\n\n")
        write_eval_to_file(f)
else:
    # Each evaluated record is expected to contribute one line starting with
    # "Answer"; counting them tells us how many records to skip.
    # NOTE(review): a reply without an "Answer" line, or a record cut off
    # before its reply was written, would throw this count off - confirm
    # the report is well-formed before resuming.
    with open("./eval.md", "r") as f:
        for line in f:
            if not line.startswith("Answer"):
                continue
            total_cnt += 1
            for verdict in ("A", "B", "both", "neither"):
                if f"The better response is {verdict}" in line:
                    res.append(verdict)
                    break
            else:
                res.append("unknown")
    with open("./eval.md", "a") as f:
        f.write("\n")
        write_eval_to_file(f, total_cnt)

# Win rate = share of records where Response A (the pipeline output) was
# judged better. Guard against an empty result set instead of dividing by 0.
win_cnt = res.count("A")
if res:
    print(f"win rate: {win_cnt / len(res)}")
else:
    print("win rate: n/a (no evaluated records)")