pratyushmaini committed
Commit 7628397 · 1 parent: 3d0f875
upload
Files changed:
- app.py  (+71 -23)
- plotter.py  (+0 -80)
- uploads.py  (+91 -0)
app.py
CHANGED
@@ -1,6 +1,24 @@
 import gradio as gr
 import pandas as pd
-import
+import os
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+from uploads import add_new_eval
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@misc{maini2024tofu,
+      title={TOFU: A Task of Fictitious Unlearning for LLMs},
+      author={Pratyush Maini and Zhili Feng and Avi Schwarzschild and Zachary Lipton and Zico Kolter},
+      year={2024},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}"""
+
+api = HfApi()
+TOKEN = os.environ.get("TOKEN", None)
+LEADERBOARD_PATH = f"locuslab/tofu_leaderboard"
+def restart_space():
+    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
 # Function to load data from a given CSV file
 def load_data(model,version,metrics):
@@ -56,7 +74,7 @@ def change_version(model, version, metrics):
     return new_df
 
 # Function to create plots
-from
+from uploads import create_plots
 
 # Initialize Gradio app
 demo = gr.Blocks()
@@ -67,6 +85,15 @@ with demo:
     The TOFU dataset is a benchmark designed to evaluate the unlearning performance of large language models in realistic scenarios. This unique dataset consists of question-answer pairs that are based on the autobiographies of 200 fictitious authors, entirely generated by the GPT-4 model. The primary objective of this task is to effectively unlearn a fine-tuned model using different portions of the forget set.
     Read more at [https://locuslab.github.io/tofu/](https://locuslab.github.io/tofu/).
     """)
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                elem_id="citation-button",
+            ) #.style(show_copy_button=True)
+
 
 
     with gr.Tabs():
@@ -124,29 +151,40 @@ with demo:
            inputs=[model_dropdown,version_dropdown,metrics_checkbox],
            outputs=leaderboard_table
        )
+
+        with gr.Accordion("Submit a new model for evaluation"):
+            with gr.Row():
+                with gr.Column():
+                    level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    model_family_textbox = gr.Textbox(label="Model family")
+                    system_prompt_textbox = gr.Textbox(label="System prompt example")
+                    url_textbox = gr.Textbox(label="Url to model information")
+                with gr.Column():
+                    organisation = gr.Textbox(label="Organisation")
+                    mail = gr.Textbox(label="Contact email")
+                    file_output = gr.File()
+
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    level_of_test,
+                    model_name_textbox,
+                    model_family_textbox,
+                    system_prompt_textbox,
+                    url_textbox,
+                    file_output,
+                    organisation,
+                    mail
+                ],
+                submission_result,
+            )
 
 
 
-    # # Dynamically update the choices for the methods checkbox
-    # def update_method_choices(version):
-    #     df = load_data(version)
-    #     methods = df['Method'].unique()
-    #     methods_checkbox.update(choices=methods)
-    #     return df
-
-    # version_dropdown_plots.change(
-    #     update_method_choices,
-    #     inputs=version_dropdown_plots,
-    #     outputs=[methods_checkbox, plot_output]
-    # )
-
-    # methods_checkbox.change(
-    #     create_plots,
-    #     inputs=[methods_checkbox, leaderboard_table],
-    #     outputs=plot_output
-    # )
-
-    # Launch the app
 
     gr.Markdown("""
     ## Applicability 🚀
@@ -177,4 +215,14 @@ with demo:
     How to push your results to the leaderboard?
 
     """)
-
+
+# scheduler = BackgroundScheduler()
+# scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.start()
+# demo.queue(default_concurrency_limit=40).launch()
+
+# demo.launch()
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=3600)
+scheduler.start()
+demo.launch(debug=True)
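For readers unfamiliar with the pattern, the new lines at the bottom of app.py pair APScheduler's background scheduler with the Hub client's restart call so the Space periodically reloads. Below is a minimal standalone sketch of that pattern; the repo id and TOKEN environment variable are taken from the diff above, and the comment about why the restart is scheduled is an assumption, not something stated in the commit.

```python
import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

api = HfApi()
TOKEN = os.environ.get("TOKEN", None)           # write token stored as a Space secret
LEADERBOARD_PATH = "locuslab/tofu_leaderboard"  # Space to restart (from the diff above)

def restart_space():
    # Ask the Hub to restart the Space (presumably so newly submitted results are picked up).
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)  # hourly, as in the committed version
scheduler.start()
```

In the committed file this runs just before `demo.launch(debug=True)`; the earlier `demo.queue(default_concurrency_limit=40).launch()` variant is left commented out.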
plotter.py
DELETED
@@ -1,80 +0,0 @@
-import seaborn as sns
-import matplotlib.pyplot as plt
-import pandas as pd
-import numpy as np
-import scipy.stats as stats
-
-import warnings
-warnings.simplefilter("ignore", category=Warning)
-
-def custom_agg(x):
-    result = stats.hmean(x)
-    return result
-
-def create_plots(big_df, selected_methods):
-    big_df = big_df[big_df['Method'].isin(selected_methods)]
-    # we want 1-Rouge-P
-    big_df["ROUGE-P Forget"] = 1 - big_df["ROUGE-P Forget"]
-
-    metrics = list(big_df.columns)
-    metrics.remove("Method")
-    metrics.remove("Model")
-    metrics.remove("Forget Rate")
-    metrics.remove("LR")
-    metrics.remove("Epoch")
-    metrics.remove("Compute")
-
-    print(metrics)
-    # Apply the custom aggregation function across each row, excluding the first column
-    row_custom_agg = big_df.iloc[:, -len(metrics):].apply(custom_agg, axis=1)
-
-    # If you want to add these results back to your original DataFrame
-    big_df['MAPO'] = row_custom_agg
-    big_df["LR"] = big_df["LR"].astype(float)
-    # big_df = big_df[big_df["LR"] >= 1e-5]
-    big_df["ROUGE-P Forget"] = 1 - big_df["ROUGE-P Forget"]
-
-    big_df.reset_index(inplace=True)
-    print(big_df[["Method", "Model", "Forget Rate", "LR", "Epoch", "ROUGE-P Forget", "MAPO"]].round(2).to_markdown())
-
-    # print(big_df.groupby(['Method', 'Model', 'Forget Rate']).head())
-    result = big_df.loc[big_df.groupby(['Method', 'Model', 'Forget Rate'])['MAPO'].idxmax()]
-    print(result[["Method", "Model", "Forget Rate", "LR", "Epoch", "MAPO"]].round(6).to_markdown())
-    # exit()
-
-    plot_legend = False
-    fs = 18 if plot_legend else 22
-    metrics.append("MAPO")
-
-    # Set the style of the visualization
-    sns.set_theme(style="whitegrid")
-    plt.rcParams['font.family'] = 'Times New Roman'
-
-    for metric_to_plot in metrics:
-        sub_df = result[big_df["Model"] == "Llama-2-7B"]
-        fig, ax = plt.subplots(figsize=(15, 5))
-        sns.barplot(x="Method", y=metric_to_plot, hue="Forget Rate", data=sub_df, ax=ax, legend=plot_legend)
-        ax.set_ylabel(metric_to_plot, fontsize=fs)
-        ax.set_ylim(0.0, 1.0)
-        ax.set_xlabel("", fontsize=fs)
-        ax.set_xticklabels(ax.get_xticklabels(), fontsize=fs)
-        ax.set_yticklabels(ax.get_yticklabels(), fontsize=fs-4)
-        ax.spines[['right', 'top']].set_visible(False)
-        if plot_legend:
-            plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Forget Rate (%)")
-        plt.title(metric_to_plot + " on Llama-2-7B", fontsize=fs)
-        plt.tight_layout()
-        plt.savefig(f"barplots/{metric_to_plot}-Llama-2-7B{'legend' if plot_legend else ''}.pdf")
-        print(f"\includegraphics[width=\\textwidth]{{figures/barplots/{metric_to_plot}-Llama-2-7B{'legend' if plot_legend else ''}.pdf}}")
-        plt.close(fig)
-
-    for model in ["Llama-2-7B", "Phi"]:
-        sub_df = result[result["Model"] == model][["Method", "Forget Rate", "MAPO"]]
-        # print(sub_df.round(6).to_latex(index=False))
-        sub_df.reset_index(inplace=True)
-
-        # Reorienting the dataframe
-        sub_df_reoriented = sub_df.pivot(index="Method", columns='Forget Rate', values='MAPO')
-
-        # Output a latex table of the MAPO values by Method and Forget Rate
-        print(sub_df_reoriented.round(4).to_latex(index=True))
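plotter.py is removed in this commit, although app.py's new `from uploads import create_plots` has no matching definition in the uploads.py added below, so that import would fail as committed. For reference, the plotter's core aggregation was a per-row harmonic mean over the metric columns, stored as a "MAPO" score. A minimal sketch of that aggregation follows; the column names and values are illustrative placeholders, not leaderboard data.

```python
import pandas as pd
from scipy import stats

# Illustrative frame; the real plotter.py aggregated the leaderboard's metric columns.
df = pd.DataFrame({
    "Method": ["grad_ascent", "grad_diff"],
    "Model Utility": [0.55, 0.60],   # placeholder metric names/values
    "Forget Quality": [0.40, 0.25],
})

metric_cols = ["Model Utility", "Forget Quality"]
# The harmonic mean (scipy.stats.hmean) rewards methods that do well on every
# metric: a single near-zero column drags the whole aggregate toward zero.
df["MAPO"] = df[metric_cols].apply(stats.hmean, axis=1)
print(df[["Method", "MAPO"]])
```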
uploads.py
ADDED
@@ -0,0 +1,91 @@
+from email.utils import parseaddr
+from huggingface_hub import HfApi
+import os
+import datetime
+
+
+OWNER="locuslab"
+SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
+RESULTS_DATASET = f"{OWNER}/results_public"
+LEADERBOARD_PATH = f"{OWNER}/tofu_leaderboard"
+api = HfApi()
+TOKEN = os.environ.get("TOKEN", None)
+YEAR_VERSION = "2024"
+
+def format_error(msg):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_warning(msg):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_log(msg):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+
+def add_new_eval(
+    val_or_test: str,
+    model: str,
+    model_family: str,
+    system_prompt: str,
+    url: str,
+    path_to_file: str,
+    organisation: str,
+    mail: str,
+):
+    # Very basic email parsing
+    _, parsed_mail = parseaddr(mail)
+    if not "@" in parsed_mail:
+        return format_warning("Please provide a valid email adress.")
+
+    print("Adding new eval")
+
+    # Check if the combination model/org already exists and prints a warning message if yes
+    # if model.lower() in set(eval_results[val_or_test]["model"]) and organisation.lower() in set(eval_results[val_or_test]["organisation"]):
+    #     return format_warning("This model has been already submitted.")
+
+    if path_to_file is None:
+        return format_warning("Please attach a file.")
+
+    # Save submitted file
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=path_to_file.name,
+        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+        repo_type="dataset",
+        token=TOKEN
+    )
+
+    # Compute score
+
+
+    # Save scored file
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+        repo_type="dataset",
+        token=TOKEN
+    )
+
+    # Actual submission
+    eval_entry = {
+        "model": model,
+        "model_family": model_family,
+        "system_prompt": system_prompt,
+        "url": url,
+        "organisation": organisation,
+        "mail": mail,
+        # "score": scores["all"]/num_questions["all"],
+        # "score_level1": scores[1]/num_questions[1],
+        # "score_level2": scores[2]/num_questions[2],
+        # "score_level3": scores[3]/num_questions[3],
+    }
+    # eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    # print(eval_results)
+    # eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
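The scoring step and the push to the results dataset are left as comments in add_new_eval. Purely as a hedged sketch of how that commented-out push would typically be completed with the `datasets` library: the repo name, config name, and token handling mirror the constants above, while the split layout and the entry schema are assumptions, not part of this commit.

```python
from datasets import load_dataset

# Assumed setup mirroring uploads.py; requires a Hub write token and an
# existing results dataset with "validation"/"test" splits.
RESULTS_DATASET = "locuslab/results_public"
YEAR_VERSION = "2024"
TOKEN = None  # in the Space this would come from os.environ["TOKEN"]

val_or_test = "validation"
eval_entry = {"model": "my-model", "organisation": "my-org"}  # hypothetical entry

# Load the existing results, append the new entry to the chosen split, push back.
eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN)
eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
```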