File size: 2,745 Bytes
953bf3d
 
debfcf8
 
 
 
 
 
 
 
 
 
 
2f5c740
 
 
 
 
 
d7f1630
 
 
 
 
 
2f5c740
d7f1630
ce419f1
b1d6e77
d7f1630
 
730ef21
 
 
d552000
 
b463202
2f5c740
d7f1630
dfc8b82
debfcf8
 
b2b46dc
 
 
debfcf8
 
c75952a
debfcf8
2f5c740
 
9226230
80ae653
dfc8b82
b1d6e77
 
debfcf8
 
80ae653
 
debfcf8
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
import os

import pandas as pd
from datasets import load_dataset


# SteamSHP: a FLAN-T5 model fine-tuned to judge which of two responses is preferred.
from transformers import T5ForConditionalGeneration, T5Tokenizer
device = 'cpu' # change to 'cuda' if you have a GPU

tokenizer = T5Tokenizer.from_pretrained('stanfordnlp/SteamSHP-flan-t5-large')
model = T5ForConditionalGeneration.from_pretrained('stanfordnlp/SteamSHP-flan-t5-large').to(device)

# Instruction-tuned models whose pre-generated outputs will be compared.
model_list = [
 'google/flan-t5-xxl',
'bigscience/bloomz-7b1',
'facebook/opt-iml-max-30b',
'allenai/tk-instruct-11b-def-pos']

# Hugging Face access token; presumably provided as a Space secret — verify in deployment config.
HF_TOKEN = os.getenv("HF_TOKEN")

OUTPUTS_DATASET = "HuggingFaceH4/instruction-pilot-outputs-filtered"

# Dataset of prompts with pre-generated outputs per model (see "filtered_outputs" usage below).
ds = load_dataset(OUTPUTS_DATASET, split="train", use_auth_token=HF_TOKEN)

def process(model_A, model_B):
    """Sample one random prompt and ask SteamSHP which model's response is better.

    Args:
        model_A: model id whose output is presented as RESPONSE A.
        model_B: model id whose output is presented as RESPONSE B.

    Returns:
        Tuple of (prompt text, DataFrame with the two models' rows,
        preferred model id).

    Raises:
        ValueError: if the sampled prompt has no stored output for one of
            the selected models (previously surfaced as a bare IndexError).
    """
    # Draw a single random example from the outputs dataset.
    sample = ds.shuffle().select(range(1))[0]
    prompt = sample["prompt"]

    df = pd.DataFrame.from_records(sample["filtered_outputs"])
    response_A_series = df[df['model'] == model_A]["output"]
    response_B_series = df[df['model'] == model_B]["output"]
    # Guard against a sampled prompt missing an output for a chosen model.
    if response_A_series.empty or response_B_series.empty:
        raise ValueError(
            f"No stored output for one of the selected models: {model_A}, {model_B}"
        )

    response_A = response_A_series.values[0]
    response_B = response_B_series.values[0]

    # SteamSHP expects this exact POST / RESPONSE A / RESPONSE B prompt template.
    input_text = (
        f"POST: {prompt}\n\n RESPONSE A: {response_A}\n\n RESPONSE B: {response_B}"
        "\n\n Which response is better? RESPONSE"
    )
    x = tokenizer([input_text], return_tensors='pt').input_ids.to(device)
    # A single new token suffices: the model answers with 'A' or 'B'.
    y = model.generate(x, max_new_tokens=1)
    preferred = tokenizer.batch_decode(y, skip_special_tokens=True)[0]
    result = model_A if preferred == 'A' else model_B
    return prompt, df[df['model'].isin([model_A, model_B])], result

title = "Compare Instruction Models to see which one is more helpful"
# NOTE: must be an f-string — the original plain string rendered the literal
# text "{OUTPUTS_DATASET}" instead of the dataset id in the markdown link.
description = f"This app compares the outputs of various open-source, instruction-trained models from a [dataset](https://huggingface.co/datasets/{OUTPUTS_DATASET}) of human demonstrations using the SteamSHP reward model trained on the [Stanford Human Preferences Dataset (SHP)](https://huggingface.co/datasets/stanfordnlp/SHP).  Hit the button below to view a few random samples from the generated outputs"

# Two dropdowns select the models to compare; outputs show the prompt, the
# two responses side by side, and the reward model's preferred option.
interface = gr.Interface(
    fn=process,
    inputs=[
        gr.Dropdown(choices=model_list, value=model_list[0], label='Model A'),
        gr.Dropdown(choices=model_list, value=model_list[1], label='Model B'),
    ],
    outputs=[
        gr.Textbox(label="Prompt"),
        gr.DataFrame(label="Model Responses"),
        gr.Textbox(label="Preferred Option"),
    ],
    title=title,
    description=description,
)

interface.launch(debug=True)