pratyushmaini committed on
Commit 7628397 · Parent(s): 3d0f875
Files changed (3)
  1. app.py +71 -23
  2. plotter.py +0 -80
  3. uploads.py +91 -0
app.py CHANGED
@@ -1,6 +1,24 @@
 import gradio as gr
 import pandas as pd
- import matplotlib.pyplot as plt
+ import os
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from huggingface_hub import HfApi
+ from uploads import add_new_eval
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@misc{maini2024tofu,
+     title={TOFU: A Task of Fictitious Unlearning for LLMs},
+     author={Pratyush Maini and Zhili Feng and Avi Schwarzschild and Zachary Lipton and Zico Kolter},
+     year={2024},
+     archivePrefix={arXiv},
+     primaryClass={cs.LG}
+ }"""
+
+ api = HfApi()
+ TOKEN = os.environ.get("TOKEN", None)
+ LEADERBOARD_PATH = f"locuslab/tofu_leaderboard"
+ def restart_space():
+     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

 # Function to load data from a given CSV file
 def load_data(model,version,metrics):
@@ -56,7 +74,7 @@ def change_version(model, version, metrics):
     return new_df

 # Function to create plots
- from plotter import create_plots
+ from uploads import create_plots

 # Initialize Gradio app
 demo = gr.Blocks()
@@ -67,6 +85,15 @@ with demo:
     The TOFU dataset is a benchmark designed to evaluate the unlearning performance of large language models in realistic scenarios. This unique dataset consists of question-answer pairs that are based on the autobiographies of 200 fictitious authors, entirely generated by the GPT-4 model. The primary objective of this task is to effectively unlearn a fine-tuned model using different portions of the forget set.
     Read more at [https://locuslab.github.io/tofu/](https://locuslab.github.io/tofu/).
     """)
+
+     with gr.Row():
+         with gr.Accordion("📙 Citation", open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id="citation-button",
+             ) #.style(show_copy_button=True)
+


     with gr.Tabs():
@@ -124,29 +151,40 @@ with demo:
         inputs=[model_dropdown,version_dropdown,metrics_checkbox],
         outputs=leaderboard_table
     )
+
+     with gr.Accordion("Submit a new model for evaluation"):
+         with gr.Row():
+             with gr.Column():
+                 level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
+                 model_name_textbox = gr.Textbox(label="Model name")
+                 model_family_textbox = gr.Textbox(label="Model family")
+                 system_prompt_textbox = gr.Textbox(label="System prompt example")
+                 url_textbox = gr.Textbox(label="Url to model information")
+             with gr.Column():
+                 organisation = gr.Textbox(label="Organisation")
+                 mail = gr.Textbox(label="Contact email")
+                 file_output = gr.File()
+
+
+         submit_button = gr.Button("Submit Eval")
+         submission_result = gr.Markdown()
+         submit_button.click(
+             add_new_eval,
+             [
+                 level_of_test,
+                 model_name_textbox,
+                 model_family_textbox,
+                 system_prompt_textbox,
+                 url_textbox,
+                 file_output,
+                 organisation,
+                 mail
+             ],
+             submission_result,
+         )



-     # # Dynamically update the choices for the methods checkbox
-     # def update_method_choices(version):
-     #     df = load_data(version)
-     #     methods = df['Method'].unique()
-     #     methods_checkbox.update(choices=methods)
-     #     return df
-
-     # version_dropdown_plots.change(
-     #     update_method_choices,
-     #     inputs=version_dropdown_plots,
-     #     outputs=[methods_checkbox, plot_output]
-     # )
-
-     # methods_checkbox.change(
-     #     create_plots,
-     #     inputs=[methods_checkbox, leaderboard_table],
-     #     outputs=plot_output
-     # )
-
-     # Launch the app

     gr.Markdown("""
     ## Applicability 🚀
@@ -177,4 +215,14 @@ with demo:
     How to push your results to the leaderboard?

     """)
- demo.launch()
+
+ # scheduler = BackgroundScheduler()
+ # scheduler.add_job(restart_space, "interval", seconds=1800)
+ # scheduler.start()
+ # demo.queue(default_concurrency_limit=40).launch()
+
+ # demo.launch()
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=3600)
+ scheduler.start()
+ demo.launch(debug=True)
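Note on the Gradio wiring added above: `submit_button.click(add_new_eval, [...], submission_result)` passes the current values of the listed components to the callback positionally, so the order of the inputs list must match `add_new_eval`'s parameter order. A minimal, self-contained sketch of the same pattern (the callback and component names here are hypothetical, not part of the commit):

```python
import gradio as gr

def echo(split: str, name: str) -> str:
    # Receives the component *values*, in the order given in `inputs`
    return f"Got split={split!r}, model={name!r}"

with gr.Blocks() as demo:
    split = gr.Radio(["validation", "test"], value="validation", label="Split")
    name = gr.Textbox(label="Model name")
    result = gr.Markdown()
    gr.Button("Submit").click(echo, inputs=[split, name], outputs=result)

demo.launch()
```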
plotter.py DELETED
@@ -1,80 +0,0 @@
- import seaborn as sns
- import matplotlib.pyplot as plt
- import pandas as pd
- import numpy as np
- import scipy.stats as stats
-
- import warnings
- warnings.simplefilter("ignore", category=Warning)
-
- def custom_agg(x):
-     result = stats.hmean(x)
-     return result
-
- def create_plots(big_df, selected_methods):
-     big_df = big_df[big_df['Method'].isin(selected_methods)]
-     # we want 1-Rouge-P
-     big_df["ROUGE-P Forget"] = 1 - big_df["ROUGE-P Forget"]
-
-     metrics = list(big_df.columns)
-     metrics.remove("Method")
-     metrics.remove("Model")
-     metrics.remove("Forget Rate")
-     metrics.remove("LR")
-     metrics.remove("Epoch")
-     metrics.remove("Compute")
-
-     print(metrics)
-     # Apply the custom aggregation function across each row, excluding the first column
-     row_custom_agg = big_df.iloc[:, -len(metrics):].apply(custom_agg, axis=1)
-
-     # If you want to add these results back to your original DataFrame
-     big_df['MAPO'] = row_custom_agg
-     big_df["LR"] = big_df["LR"].astype(float)
-     # big_df = big_df[big_df["LR"] >= 1e-5]
-     big_df["ROUGE-P Forget"] = 1 - big_df["ROUGE-P Forget"]
-
-     big_df.reset_index(inplace=True)
-     print(big_df[["Method", "Model", "Forget Rate", "LR", "Epoch", "ROUGE-P Forget", "MAPO"]].round(2).to_markdown())
-
-     # print(big_df.groupby(['Method', 'Model', 'Forget Rate']).head())
-     result = big_df.loc[big_df.groupby(['Method', 'Model', 'Forget Rate'])['MAPO'].idxmax()]
-     print(result[["Method", "Model", "Forget Rate", "LR", "Epoch", "MAPO"]].round(6).to_markdown())
-     # exit()
-
-     plot_legend = False
-     fs = 18 if plot_legend else 22
-     metrics.append("MAPO")
-
-     # Set the style of the visualization
-     sns.set_theme(style="whitegrid")
-     plt.rcParams['font.family'] = 'Times New Roman'
-
-     for metric_to_plot in metrics:
-         sub_df = result[big_df["Model"] == "Llama-2-7B"]
-         fig, ax = plt.subplots(figsize=(15, 5))
-         sns.barplot(x="Method", y=metric_to_plot, hue="Forget Rate", data=sub_df, ax=ax, legend=plot_legend)
-         ax.set_ylabel(metric_to_plot, fontsize=fs)
-         ax.set_ylim(0.0, 1.0)
-         ax.set_xlabel("", fontsize=fs)
-         ax.set_xticklabels(ax.get_xticklabels(), fontsize=fs)
-         ax.set_yticklabels(ax.get_yticklabels(), fontsize=fs-4)
-         ax.spines[['right', 'top']].set_visible(False)
-         if plot_legend:
-             plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Forget Rate (%)")
-         plt.title(metric_to_plot + " on Llama-2-7B", fontsize=fs)
-         plt.tight_layout()
-         plt.savefig(f"barplots/{metric_to_plot}-Llama-2-7B{'legend' if plot_legend else ''}.pdf")
-         print(f"\includegraphics[width=\\textwidth]{{figures/barplots/{metric_to_plot}-Llama-2-7B{'legend' if plot_legend else ''}.pdf}}")
-         plt.close(fig)
-
-     for model in ["Llama-2-7B", "Phi"]:
-         sub_df = result[result["Model"] == model][["Method", "Forget Rate", "MAPO"]]
-         # print(sub_df.round(6).to_latex(index=False))
-         sub_df.reset_index(inplace=True)
-
-         # Reorienting the dataframe
-         sub_df_reoriented = sub_df.pivot(index="Method", columns='Forget Rate', values='MAPO')
-
-         # Output a latex table of the MAPO values by Method and Forget Rate
-         print(sub_df_reoriented.round(4).to_latex(index=True))
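For context on the deleted plotter: `custom_agg` reduced each row of metric columns to a single score (labelled MAPO) with a harmonic mean, after flipping ROUGE-P Forget to 1 − ROUGE-P so that higher is uniformly better; that score was then used to pick the best LR/epoch per method. A minimal sketch of that aggregation on made-up metric values (the column names here are illustrative, not the leaderboard's actual columns):

```python
import pandas as pd
from scipy import stats

# Hypothetical per-run metrics, all oriented so that higher is better and > 0
df = pd.DataFrame({
    "Model Utility": [0.62, 0.55],
    "Forget Quality": [0.48, 0.71],
    "1 - ROUGE-P Forget": [0.65, 0.48],
})

# Row-wise harmonic mean, as custom_agg did in the deleted plotter.py
df["MAPO"] = df.apply(stats.hmean, axis=1)
print(df.round(3))
```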
uploads.py ADDED
@@ -0,0 +1,91 @@
+ from email.utils import parseaddr
+ from huggingface_hub import HfApi
+ import os
+ import datetime
+
+
+ OWNER="locuslab"
+ SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
+ RESULTS_DATASET = f"{OWNER}/results_public"
+ LEADERBOARD_PATH = f"{OWNER}/tofu_leaderboard"
+ api = HfApi()
+ TOKEN = os.environ.get("TOKEN", None)
+ YEAR_VERSION = "2024"
+
+ def format_error(msg):
+     return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_warning(msg):
+     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def format_log(msg):
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+
+ def add_new_eval(
+     val_or_test: str,
+     model: str,
+     model_family: str,
+     system_prompt: str,
+     url: str,
+     path_to_file: str,
+     organisation: str,
+     mail: str,
+ ):
+     # Very basic email parsing
+     _, parsed_mail = parseaddr(mail)
+     if "@" not in parsed_mail:
+         return format_warning("Please provide a valid email address.")
+
+     print("Adding new eval")
+
+     # Check if the combination model/org already exists and prints a warning message if yes
+     # if model.lower() in set(eval_results[val_or_test]["model"]) and organisation.lower() in set(eval_results[val_or_test]["organisation"]):
+     #     return format_warning("This model has already been submitted.")
+
+     if path_to_file is None:
+         return format_warning("Please attach a file.")
+
+     # Save submitted file
+     api.upload_file(
+         repo_id=SUBMISSION_DATASET,
+         path_or_fileobj=path_to_file.name,
+         path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+         repo_type="dataset",
+         token=TOKEN
+     )
+
+     # Compute score
+
+
+     # Save scored file
+     api.upload_file(
+         repo_id=SUBMISSION_DATASET,
+         path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+         path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+         repo_type="dataset",
+         token=TOKEN
+     )
+
+     # Actual submission
+     eval_entry = {
+         "model": model,
+         "model_family": model_family,
+         "system_prompt": system_prompt,
+         "url": url,
+         "organisation": organisation,
+         "mail": mail,
+         # "score": scores["all"]/num_questions["all"],
+         # "score_level1": scores[1]/num_questions[1],
+         # "score_level2": scores[2]/num_questions[2],
+         # "score_level3": scores[3]/num_questions[3],
+     }
+     # eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+     # print(eval_results)
+     # eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+     return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
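The early returns in `add_new_eval` make its input validation easy to exercise locally, since invalid input is rejected before any Hub upload happens (so no TOKEN is needed). A quick, hypothetical example of how it behaves; all values are made up, and a real submission goes through the form in app.py:

```python
from uploads import add_new_eval

# Invalid email: returns the orange warning markup, nothing is uploaded
print(add_new_eval("validation", "my-model", "Llama-2-7B",
                   "You are a helpful assistant.", "https://example.com/my-model",
                   None, "my-org", "not-an-email"))

# Missing results file: also caught before any Hub interaction
print(add_new_eval("validation", "my-model", "Llama-2-7B",
                   "You are a helpful assistant.", "https://example.com/my-model",
                   None, "my-org", "author@example.com"))
```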