pratyushmaini committed
Commit 7628397 · 1 parent: 3d0f875
upload
Files changed:
- app.py  (+71 -23)
- plotter.py  (+0 -80)
- uploads.py  (+91 -0)
app.py
CHANGED
@@ -1,6 +1,24 @@
 import gradio as gr
 import pandas as pd
-import
+import os
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import HfApi
+from uploads import add_new_eval
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@misc{maini2024tofu,
+      title={TOFU: A Task of Fictitious Unlearning for LLMs},
+      author={Pratyush Maini and Zhili Feng and Avi Schwarzschild and Zachary Lipton and Zico Kolter},
+      year={2024},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}"""
+
+api = HfApi()
+TOKEN = os.environ.get("TOKEN", None)
+LEADERBOARD_PATH = f"locuslab/tofu_leaderboard"
+def restart_space():
+    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
 # Function to load data from a given CSV file
 def load_data(model,version,metrics):
@@ -56,7 +74,7 @@ def change_version(model, version, metrics):
     return new_df
 
 # Function to create plots
-from
+from uploads import create_plots
 
 # Initialize Gradio app
 demo = gr.Blocks()
@@ -67,6 +85,15 @@ with demo:
     The TOFU dataset is a benchmark designed to evaluate the unlearning performance of large language models in realistic scenarios. This unique dataset consists of question-answer pairs that are based on the autobiographies of 200 fictitious authors, entirely generated by the GPT-4 model. The primary objective of this task is to effectively unlearn a fine-tuned model using different portions of the forget set.
     Read more at [https://locuslab.github.io/tofu/](https://locuslab.github.io/tofu/).
     """)
+
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                elem_id="citation-button",
+            ) #.style(show_copy_button=True)
+
 
 
     with gr.Tabs():
@@ -124,29 +151,40 @@ with demo:
            inputs=[model_dropdown,version_dropdown,metrics_checkbox],
            outputs=leaderboard_table
        )
+
+        with gr.Accordion("Submit a new model for evaluation"):
+            with gr.Row():
+                with gr.Column():
+                    level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    model_family_textbox = gr.Textbox(label="Model family")
+                    system_prompt_textbox = gr.Textbox(label="System prompt example")
+                    url_textbox = gr.Textbox(label="Url to model information")
+                with gr.Column():
+                    organisation = gr.Textbox(label="Organisation")
+                    mail = gr.Textbox(label="Contact email")
+                    file_output = gr.File()
+
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    level_of_test,
+                    model_name_textbox,
+                    model_family_textbox,
+                    system_prompt_textbox,
+                    url_textbox,
+                    file_output,
+                    organisation,
+                    mail
+                ],
+                submission_result,
+            )
 
 
 
-    # # Dynamically update the choices for the methods checkbox
-    # def update_method_choices(version):
-    #     df = load_data(version)
-    #     methods = df['Method'].unique()
-    #     methods_checkbox.update(choices=methods)
-    #     return df
-
-    # version_dropdown_plots.change(
-    #     update_method_choices,
-    #     inputs=version_dropdown_plots,
-    #     outputs=[methods_checkbox, plot_output]
-    # )
-
-    # methods_checkbox.change(
-    #     create_plots,
-    #     inputs=[methods_checkbox, leaderboard_table],
-    #     outputs=plot_output
-    # )
-
-    # Launch the app
 
     gr.Markdown("""
     ## Applicability 🚀
@@ -177,4 +215,14 @@ with demo:
     How to push your results to the leaderboard?
 
     """)
-
+
+# scheduler = BackgroundScheduler()
+# scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.start()
+# demo.queue(default_concurrency_limit=40).launch()
+
+# demo.launch()
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=3600)
+scheduler.start()
+demo.launch(debug=True)
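For readers unfamiliar with the pattern, the new lines at the bottom of app.py pair APScheduler's background scheduler with the Hub client's restart call so the Space periodically reloads. Below is a minimal standalone sketch of that pattern; the repo id and TOKEN environment variable are taken from the diff above, and the comment about why the restart is scheduled is an assumption, not something stated in the commit.

```python
import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

api = HfApi()
TOKEN = os.environ.get("TOKEN", None)           # write token stored as a Space secret
LEADERBOARD_PATH = "locuslab/tofu_leaderboard"  # Space to restart (from the diff above)

def restart_space():
    # Ask the Hub to restart the Space (presumably so newly submitted results are picked up).
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)  # hourly, as in the committed version
scheduler.start()
```

In the committed file this runs just before `demo.launch(debug=True)`; the earlier `demo.queue(default_concurrency_limit=40).launch()` variant is left commented out.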
plotter.py
DELETED
@@ -1,80 +0,0 @@
-import seaborn as sns
-import matplotlib.pyplot as plt
-import pandas as pd
-import numpy as np
-import scipy.stats as stats
-
-import warnings
-warnings.simplefilter("ignore", category=Warning)
-
-def custom_agg(x):
-    result = stats.hmean(x)
-    return result
-
-def create_plots(big_df, selected_methods):
-    big_df = big_df[big_df['Method'].isin(selected_methods)]
-    # we want 1-Rouge-P
-    big_df["ROUGE-P Forget"] = 1 - big_df["ROUGE-P Forget"]
-
-    metrics = list(big_df.columns)
-    metrics.remove("Method")
-    metrics.remove("Model")
-    metrics.remove("Forget Rate")
-    metrics.remove("LR")
-    metrics.remove("Epoch")
-    metrics.remove("Compute")
-
-    print(metrics)
-    # Apply the custom aggregation function across each row, excluding the first column
-    row_custom_agg = big_df.iloc[:, -len(metrics):].apply(custom_agg, axis=1)
-
-    # If you want to add these results back to your original DataFrame
-    big_df['MAPO'] = row_custom_agg
-    big_df["LR"] = big_df["LR"].astype(float)
-    # big_df = big_df[big_df["LR"] >= 1e-5]
-    big_df["ROUGE-P Forget"] = 1 - big_df["ROUGE-P Forget"]
-
-    big_df.reset_index(inplace=True)
-    print(big_df[["Method", "Model", "Forget Rate", "LR", "Epoch", "ROUGE-P Forget", "MAPO"]].round(2).to_markdown())
-
-    # print(big_df.groupby(['Method', 'Model', 'Forget Rate']).head())
-    result = big_df.loc[big_df.groupby(['Method', 'Model', 'Forget Rate'])['MAPO'].idxmax()]
-    print(result[["Method", "Model", "Forget Rate", "LR", "Epoch", "MAPO"]].round(6).to_markdown())
-    # exit()
-
-    plot_legend = False
-    fs = 18 if plot_legend else 22
-    metrics.append("MAPO")
-
-    # Set the style of the visualization
-    sns.set_theme(style="whitegrid")
-    plt.rcParams['font.family'] = 'Times New Roman'
-
-    for metric_to_plot in metrics:
-        sub_df = result[big_df["Model"] == "Llama-2-7B"]
-        fig, ax = plt.subplots(figsize=(15, 5))
-        sns.barplot(x="Method", y=metric_to_plot, hue="Forget Rate", data=sub_df, ax=ax, legend=plot_legend)
-        ax.set_ylabel(metric_to_plot, fontsize=fs)
-        ax.set_ylim(0.0, 1.0)
-        ax.set_xlabel("", fontsize=fs)
-        ax.set_xticklabels(ax.get_xticklabels(), fontsize=fs)
-        ax.set_yticklabels(ax.get_yticklabels(), fontsize=fs-4)
-        ax.spines[['right', 'top']].set_visible(False)
-        if plot_legend:
-            plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Forget Rate (%)")
-        plt.title(metric_to_plot + " on Llama-2-7B", fontsize=fs)
-        plt.tight_layout()
-        plt.savefig(f"barplots/{metric_to_plot}-Llama-2-7B{'legend' if plot_legend else ''}.pdf")
-        print(f"\includegraphics[width=\\textwidth]{{figures/barplots/{metric_to_plot}-Llama-2-7B{'legend' if plot_legend else ''}.pdf}}")
-        plt.close(fig)
-
-    for model in ["Llama-2-7B", "Phi"]:
-        sub_df = result[result["Model"] == model][["Method", "Forget Rate", "MAPO"]]
-        # print(sub_df.round(6).to_latex(index=False))
-        sub_df.reset_index(inplace=True)
-
-        # Reorienting the dataframe
-        sub_df_reoriented = sub_df.pivot(index="Method", columns='Forget Rate', values='MAPO')
-
-        # Output a latex table of the MAPO values by Method and Forget Rate
-        print(sub_df_reoriented.round(4).to_latex(index=True))
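plotter.py is removed in this commit, although app.py's new `from uploads import create_plots` has no matching definition in the uploads.py added below, so that import would fail as committed. For reference, the plotter's core aggregation was a per-row harmonic mean over the metric columns, stored as a "MAPO" score. A minimal sketch of that aggregation follows; the column names and values are illustrative placeholders, not leaderboard data.

```python
import pandas as pd
from scipy import stats

# Illustrative frame; the real plotter.py aggregated the leaderboard's metric columns.
df = pd.DataFrame({
    "Method": ["grad_ascent", "grad_diff"],
    "Model Utility": [0.55, 0.60],   # placeholder metric names/values
    "Forget Quality": [0.40, 0.25],
})

metric_cols = ["Model Utility", "Forget Quality"]
# The harmonic mean (scipy.stats.hmean) rewards methods that do well on every
# metric: a single near-zero column drags the whole aggregate toward zero.
df["MAPO"] = df[metric_cols].apply(stats.hmean, axis=1)
print(df[["Method", "MAPO"]])
```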
uploads.py
ADDED
@@ -0,0 +1,91 @@
+from email.utils import parseaddr
+from huggingface_hub import HfApi
+import os
+import datetime
+
+
+OWNER="locuslab"
+SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
+RESULTS_DATASET = f"{OWNER}/results_public"
+LEADERBOARD_PATH = f"{OWNER}/tofu_leaderboard"
+api = HfApi()
+TOKEN = os.environ.get("TOKEN", None)
+YEAR_VERSION = "2024"
+
+def format_error(msg):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_warning(msg):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def format_log(msg):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+
+def add_new_eval(
+    val_or_test: str,
+    model: str,
+    model_family: str,
+    system_prompt: str,
+    url: str,
+    path_to_file: str,
+    organisation: str,
+    mail: str,
+):
+    # Very basic email parsing
+    _, parsed_mail = parseaddr(mail)
+    if not "@" in parsed_mail:
+        return format_warning("Please provide a valid email adress.")
+
+    print("Adding new eval")
+
+    # Check if the combination model/org already exists and prints a warning message if yes
+    # if model.lower() in set(eval_results[val_or_test]["model"]) and organisation.lower() in set(eval_results[val_or_test]["organisation"]):
+    #     return format_warning("This model has been already submitted.")
+
+    if path_to_file is None:
+        return format_warning("Please attach a file.")
+
+    # Save submitted file
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=path_to_file.name,
+        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
+        repo_type="dataset",
+        token=TOKEN
+    )
+
+    # Compute score
+
+
+    # Save scored file
+    api.upload_file(
+        repo_id=SUBMISSION_DATASET,
+        path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
+        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
+        repo_type="dataset",
+        token=TOKEN
+    )
+
+    # Actual submission
+    eval_entry = {
+        "model": model,
+        "model_family": model_family,
+        "system_prompt": system_prompt,
+        "url": url,
+        "organisation": organisation,
+        "mail": mail,
+        # "score": scores["all"]/num_questions["all"],
+        # "score_level1": scores[1]/num_questions[1],
+        # "score_level2": scores[2]/num_questions[2],
+        # "score_level3": scores[3]/num_questions[3],
+    }
+    # eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    # print(eval_results)
+    # eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
+
+    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
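The scoring step and the push to the results dataset are left as comments in add_new_eval. Purely as a hedged sketch of how that commented-out push would typically be completed with the `datasets` library: the repo name, config name, and token handling mirror the constants above, while the split layout and the entry schema are assumptions, not part of this commit.

```python
from datasets import load_dataset

# Assumed setup mirroring uploads.py; requires a Hub write token and an
# existing results dataset with "validation"/"test" splits.
RESULTS_DATASET = "locuslab/results_public"
YEAR_VERSION = "2024"
TOKEN = None  # in the Space this would come from os.environ["TOKEN"]

val_or_test = "validation"
eval_entry = {"model": "my-model", "organisation": "my-org"}  # hypothetical entry

# Load the existing results, append the new entry to the chosen split, push back.
eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN)
eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
```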