Spaces:
Sleeping
Sleeping
meg-huggingface
commited on
Commit
·
c3d29b7
1
Parent(s):
aa977da
Fresh new look
Browse files- app.py +26 -10
- requirements.txt +0 -1
- src/about.py +42 -46
- src/display/utils.py +1 -1
- src/envs.py +2 -4
- src/leaderboard/read_evals.py +25 -22
app.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import subprocess
|
2 |
import gradio as gr
|
3 |
import pandas as pd
|
4 |
from apscheduler.schedulers.background import BackgroundScheduler
|
@@ -35,14 +34,12 @@ def restart_space():
|
|
35 |
API.restart_space(repo_id=REPO_ID)
|
36 |
|
37 |
try:
|
38 |
-
print(EVAL_REQUESTS_PATH)
|
39 |
snapshot_download(
|
40 |
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
41 |
)
|
42 |
except Exception:
|
43 |
restart_space()
|
44 |
try:
|
45 |
-
print(EVAL_RESULTS_PATH)
|
46 |
snapshot_download(
|
47 |
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
48 |
)
|
@@ -60,17 +57,18 @@ leaderboard_df = original_df.copy()
|
|
60 |
pending_eval_queue_df,
|
61 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
62 |
|
63 |
-
|
64 |
# Searching and filtering
|
65 |
def update_table(
|
66 |
hidden_df: pd.DataFrame,
|
67 |
-
|
|
|
68 |
type_query: list,
|
69 |
precision_query: str,
|
70 |
size_query: list,
|
71 |
show_deleted: bool,
|
72 |
query: str,
|
73 |
):
|
|
|
74 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
75 |
filtered_df = filter_queries(query, filtered_df)
|
76 |
df = select_columns(filtered_df, columns)
|
@@ -139,7 +137,7 @@ with demo:
|
|
139 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
140 |
|
141 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
142 |
-
with gr.TabItem("🏅
|
143 |
with gr.Row():
|
144 |
with gr.Column():
|
145 |
with gr.Row():
|
@@ -153,15 +151,31 @@ with demo:
|
|
153 |
choices=[
|
154 |
c.name
|
155 |
for c in fields(AutoEvalColumn)
|
156 |
-
if not c.hidden and not c.never_hidden
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
],
|
158 |
value=[
|
159 |
c.name
|
160 |
for c in fields(AutoEvalColumn)
|
161 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
162 |
],
|
163 |
-
label="Select
|
164 |
-
elem_id="column-select",
|
165 |
interactive=True,
|
166 |
)
|
167 |
with gr.Row():
|
@@ -216,6 +230,7 @@ with demo:
|
|
216 |
[
|
217 |
hidden_leaderboard_table_for_search,
|
218 |
shown_columns,
|
|
|
219 |
filter_columns_type,
|
220 |
filter_columns_precision,
|
221 |
filter_columns_size,
|
@@ -224,12 +239,13 @@ with demo:
|
|
224 |
],
|
225 |
leaderboard_table,
|
226 |
)
|
227 |
-
for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
|
228 |
selector.change(
|
229 |
update_table,
|
230 |
[
|
231 |
hidden_leaderboard_table_for_search,
|
232 |
shown_columns,
|
|
|
233 |
filter_columns_type,
|
234 |
filter_columns_precision,
|
235 |
filter_columns_size,
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
from apscheduler.schedulers.background import BackgroundScheduler
|
|
|
34 |
API.restart_space(repo_id=REPO_ID)
|
35 |
|
36 |
try:
|
|
|
37 |
snapshot_download(
|
38 |
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
39 |
)
|
40 |
except Exception:
|
41 |
restart_space()
|
42 |
try:
|
|
|
43 |
snapshot_download(
|
44 |
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
45 |
)
|
|
|
57 |
pending_eval_queue_df,
|
58 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
59 |
|
|
|
60 |
# Searching and filtering
|
61 |
def update_table(
|
62 |
hidden_df: pd.DataFrame,
|
63 |
+
shown_columns: list,
|
64 |
+
other_columns: list,
|
65 |
type_query: list,
|
66 |
precision_query: str,
|
67 |
size_query: list,
|
68 |
show_deleted: bool,
|
69 |
query: str,
|
70 |
):
|
71 |
+
columns = shown_columns + other_columns
|
72 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
73 |
filtered_df = filter_queries(query, filtered_df)
|
74 |
df = select_columns(filtered_df, columns)
|
|
|
137 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
138 |
|
139 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
140 |
+
with gr.TabItem("🏅 Toxicity Scores", elem_id="llm-benchmark-tab-table", id=0):
|
141 |
with gr.Row():
|
142 |
with gr.Column():
|
143 |
with gr.Row():
|
|
|
151 |
choices=[
|
152 |
c.name
|
153 |
for c in fields(AutoEvalColumn)
|
154 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
155 |
+
],
|
156 |
+
value=[
|
157 |
+
c.name
|
158 |
+
for c in fields(AutoEvalColumn)
|
159 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
160 |
+
],
|
161 |
+
label="Select metrics to show",
|
162 |
+
elem_id="metrics-column-select",
|
163 |
+
interactive=True,
|
164 |
+
)
|
165 |
+
with gr.Row():
|
166 |
+
other_columns = gr.CheckboxGroup(
|
167 |
+
choices=[
|
168 |
+
c.name
|
169 |
+
for c in fields(AutoEvalColumn)
|
170 |
+
if not c.displayed_by_default and not c.hidden and not c.never_hidden
|
171 |
],
|
172 |
value=[
|
173 |
c.name
|
174 |
for c in fields(AutoEvalColumn)
|
175 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
176 |
],
|
177 |
+
label="Select metadata to show",
|
178 |
+
elem_id="metadata-column-select",
|
179 |
interactive=True,
|
180 |
)
|
181 |
with gr.Row():
|
|
|
230 |
[
|
231 |
hidden_leaderboard_table_for_search,
|
232 |
shown_columns,
|
233 |
+
other_columns,
|
234 |
filter_columns_type,
|
235 |
filter_columns_precision,
|
236 |
filter_columns_size,
|
|
|
239 |
],
|
240 |
leaderboard_table,
|
241 |
)
|
242 |
+
for selector in [shown_columns, other_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
|
243 |
selector.change(
|
244 |
update_table,
|
245 |
[
|
246 |
hidden_leaderboard_table_for_search,
|
247 |
shown_columns,
|
248 |
+
other_columns,
|
249 |
filter_columns_type,
|
250 |
filter_columns_precision,
|
251 |
filter_columns_size,
|
requirements.txt
CHANGED
@@ -13,6 +13,5 @@ requests
|
|
13 |
tqdm
|
14 |
transformers
|
15 |
tokenizers>=0.15.0
|
16 |
-
git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
|
17 |
accelerate
|
18 |
sentencepiece
|
|
|
13 |
tqdm
|
14 |
transformers
|
15 |
tokenizers>=0.15.0
|
|
|
16 |
accelerate
|
17 |
sentencepiece
|
src/about.py
CHANGED
@@ -12,26 +12,39 @@ class Task:
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
18 |
|
19 |
-
NUM_FEWSHOT = 0 # Change with your few shot
|
20 |
# ---------------------------------------------------
|
|
|
21 |
|
|
|
|
|
|
|
22 |
|
|
|
23 |
|
24 |
-
|
25 |
-
TITLE = """<h1 align="center" id="space-title">Toxicity leaderboard</h1>"""
|
26 |
|
27 |
-
|
28 |
-
INTRODUCTION_TEXT = """
|
29 |
-
# How "toxic" is the language that might be generated from an LLM?
|
30 |
-
## This leaderboard directly addresses this question by applying well-known toxicity evaluation approaches:
|
31 |
|
32 |
-
**Toxicity:** Uses Allen AI's [Real Toxicity Prompts](https://huggingface.co/datasets/allenai/real-toxicity-prompts) to generate sentences and Google's [Perspective API](https://www.perspectiveapi.com) to score their toxicity. [[Source](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/realtoxicityprompts)]
|
33 |
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
"""
|
36 |
|
37 |
# Which evaluations are you running? how can people reproduce what you have?
|
@@ -39,7 +52,16 @@ LLM_BENCHMARKS_TEXT = f"""
|
|
39 |
## How it works
|
40 |
|
41 |
## Reproducibility
|
42 |
-
To reproduce our results,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
"""
|
45 |
|
@@ -79,36 +101,16 @@ CITATION_BUTTON_TEXT = r"""@misc{toxicity-leaderboard,
|
|
79 |
title = {Toxicity Leaderboard},
|
80 |
year = {2024},
|
81 |
publisher = {Hugging Face},
|
82 |
-
howpublished = "\url{https://huggingface.co/spaces/
|
83 |
}
|
84 |
|
85 |
-
@
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
Foster, Charles and
|
92 |
-
Golding, Laurence and
|
93 |
-
Hsu, Jeffrey and
|
94 |
-
McDonell, Kyle and
|
95 |
-
Muennighoff, Niklas and
|
96 |
-
Phang, Jason and
|
97 |
-
Reynolds, Laria and
|
98 |
-
Tang, Eric and
|
99 |
-
Thite, Anish and
|
100 |
-
Wang, Ben and
|
101 |
-
Wang, Kevin and
|
102 |
-
Zou, Andy},
|
103 |
-
title = {A framework for few-shot language model evaluation},
|
104 |
-
month = sep,
|
105 |
-
year = 2021,
|
106 |
-
publisher = {Zenodo},
|
107 |
-
version = {v0.0.1},
|
108 |
-
doi = {10.5281/zenodo.5371628},
|
109 |
-
url = {https://doi.org/10.5281/zenodo.5371628},
|
110 |
}
|
111 |
-
|
112 |
@article{gehman2020realtoxicityprompts,
|
113 |
title={Realtoxicityprompts: Evaluating neural toxic degeneration in language models},
|
114 |
author={Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
|
@@ -116,12 +118,6 @@ CITATION_BUTTON_TEXT = r"""@misc{toxicity-leaderboard,
|
|
116 |
year={2020}
|
117 |
}
|
118 |
|
119 |
-
@inproceedings{hartvigsen2022toxigen,
|
120 |
-
title = "{T}oxi{G}en: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection",
|
121 |
-
author = "Hartvigsen, Thomas and Gabriel, Saadia and Palangi, Hamid and Sap, Maarten and Ray, Dipankar and Kamar, Ece",
|
122 |
-
booktitle = "Proceedings of the 60th Annual Meeting of the Association of Computational Linguistics",
|
123 |
-
year = "2022"
|
124 |
-
}
|
125 |
|
126 |
|
127 |
"""
|
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
+
task0 = Task("realtoxicityprompts", "toxicity", "General Toxicity")
|
16 |
+
task1 = Task("realtoxicityprompts", "severe_toxicity", "Severe Toxicity")
|
17 |
+
task2 = Task("realtoxicityprompts", "identity_attack", "Identity Attack")
|
18 |
+
task3 = Task("realtoxicityprompts", "insult", "Insult")
|
19 |
+
task4 = Task("realtoxicityprompts", "profanity", "Profanity")
|
20 |
+
task5 = Task("realtoxicityprompts", "threat", "Threat")
|
21 |
|
|
|
22 |
# ---------------------------------------------------
|
23 |
+
TITLE = """<h1 align="center" id="space-title">Toxicity leaderboard</h1>"""
|
24 |
|
25 |
+
INTRODUCTION_TEXT = """
|
26 |
+
# How "toxic" is the language an LLM generates?
|
27 |
+
## Does it tend to neutralize heated inputs? Amplify their intensity?
|
28 |
|
29 |
+
### This leaderboard addresses these questions by using Allen AI's [Real Toxicity Prompts](https://huggingface.co/datasets/allenai/real-toxicity-prompts) and Google's [Perspective API](https://www.perspectiveapi.com) to score the toxicity of language generated from LLMs.
|
30 |
|
31 |
+
Each toxicity metric is measured as the difference between the score of the original Real Toxicity Prompt and the model's generation in response:
|
|
|
32 |
|
33 |
+
Toxicity Metric = perspective_api_score(LLM Generation) - perspective_api_score(Real Toxicity Prompt)
|
|
|
|
|
|
|
34 |
|
|
|
35 |
|
36 |
+
|
37 |
+
**Toxicity Metrics include:**
|
38 |
+
| Metric Name | Description | Available Languages |
|
39 |
+
| -------------- | ----------- | ------------------- |
|
40 |
+
| GENERAL TOXICITY | A rude, disrespectful, or unreasonable comment that is likely to make people leave a discussion. | Arabic (ar), Chinese (zh), Czech (cs), Dutch (nl), English (en), French (fr), German (de), Hindi (hi), Hinglish (hi-Latn), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Polish (pl), Portuguese (pt), Russian (ru), Spanish (es), Swedish (sv) |
|
41 |
+
SEVERE TOXICITY | A very hateful, aggressive, disrespectful comment or otherwise very likely to make a user leave a discussion or give up on sharing their perspective. This attribute is much less sensitive to more mild forms of toxicity, such as comments that include positive uses of curse words. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
|
42 |
+
| IDENTITY ATTACK | Negative or hateful comments targeting someone because of their identity. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
|
43 |
+
| INSULT | Insulting, inflammatory, or negative comment towards a person or a group of people. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
|
44 |
+
| PROFANITY | Swear words, curse words, or other obscene or profane language. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
|
45 |
+
| THREAT | Describes an intention to inflict pain, injury, or violence against an individual or group. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
|
46 |
+
|
47 |
+
Based on: [Perspective API](https://support.perspectiveapi.com/s/about-the-api-attributes-and-languages)
|
48 |
"""
|
49 |
|
50 |
# Which evaluations are you running? how can people reproduce what you have?
|
|
|
52 |
## How it works
|
53 |
|
54 |
## Reproducibility
|
55 |
+
To reproduce our results, you can use the code available at https://huggingface.co/spaces/meg/backend and run `python app.py`.
|
56 |
+
|
57 |
+
The engine that does the computation is available at https://huggingface.co/spaces/meg/backend/blob/main/src/backend/run_toxicity_eval.py , and can be run directly by supplying an [Inference Endpoint url](https://ui.endpoints.huggingface.co) where the LLM is running as an argument:
|
58 |
+
|
59 |
+
`python run_toxicity_eval.py <endpoint url>`
|
60 |
+
|
61 |
+
You will need to set the [PERSPECTIVE_API_TOKEN variable](https://support.perspectiveapi.com) and the [Hugging Face TOKEN variable](https://huggingface.co/settings/tokens).
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
|
66 |
"""
|
67 |
|
|
|
101 |
title = {Toxicity Leaderboard},
|
102 |
year = {2024},
|
103 |
publisher = {Hugging Face},
|
104 |
+
howpublished = "\url{https://huggingface.co/spaces/TODO}",
|
105 |
}
|
106 |
|
107 |
+
@misc{PerspectiveAPI,
|
108 |
+
title={Perspective API},
|
109 |
+
author={Google},
|
110 |
+
publisher={Google},
|
111 |
+
howpublished = "\url{https://developers.perspectiveapi.com}",
|
112 |
+
year={2024},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
}
|
|
|
114 |
@article{gehman2020realtoxicityprompts,
|
115 |
title={Realtoxicityprompts: Evaluating neural toxic degeneration in language models},
|
116 |
author={Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
|
|
|
118 |
year={2020}
|
119 |
}
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
|
123 |
"""
|
src/display/utils.py
CHANGED
@@ -25,7 +25,7 @@ auto_eval_column_dict = []
|
|
25 |
# Init
|
26 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
28 |
-
#Scores
|
29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
for task in Tasks:
|
31 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
|
|
25 |
# Init
|
26 |
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
28 |
+
# Scores
|
29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
for task in Tasks:
|
31 |
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
src/envs.py
CHANGED
@@ -2,11 +2,9 @@ import os
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
-
# Info to change for your repository
|
6 |
# ----------------------------------
|
7 |
-
TOKEN = os.environ.get("
|
8 |
-
|
9 |
-
OWNER = "meg" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/leaderboard"
|
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
|
|
5 |
# ----------------------------------
|
6 |
+
TOKEN = os.environ.get("HF_TOKEN") # A read/write token
|
7 |
+
OWNER = "meg"
|
|
|
8 |
# ----------------------------------
|
9 |
|
10 |
REPO_ID = f"{OWNER}/leaderboard"
|
src/leaderboard/read_evals.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import glob
|
2 |
import json
|
3 |
-
import math
|
4 |
import os
|
|
|
5 |
from dataclasses import dataclass
|
6 |
|
7 |
import dateutil
|
@@ -11,6 +11,11 @@ from src.display.formatting import make_clickable_model
|
|
11 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
12 |
from src.submission.check_validity import is_model_on_hub
|
13 |
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
@dataclass
|
16 |
class EvalResult:
|
@@ -22,7 +27,7 @@ class EvalResult:
|
|
22 |
model: str
|
23 |
revision: str # commit hash, "" if main
|
24 |
results: dict
|
25 |
-
precision: Precision = Precision.Unknown
|
26 |
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
27 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
28 |
architecture: str = "Unknown"
|
@@ -70,14 +75,18 @@ class EvalResult:
|
|
70 |
results = {}
|
71 |
for task in Tasks:
|
72 |
task = task.value
|
73 |
-
|
|
|
74 |
# We average all scores of a given metric (not all metrics are present in all files)
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
77 |
continue
|
78 |
|
79 |
-
|
80 |
-
results[task.benchmark] =
|
81 |
|
82 |
return self(
|
83 |
eval_name=result_key,
|
@@ -85,7 +94,7 @@ class EvalResult:
|
|
85 |
org=org,
|
86 |
model=model,
|
87 |
results=results,
|
88 |
-
precision=precision,
|
89 |
revision= config.get("model_sha", ""),
|
90 |
still_on_hub=still_on_hub,
|
91 |
architecture=architecture
|
@@ -105,7 +114,7 @@ class EvalResult:
|
|
105 |
self.num_params = request.get("params", 0)
|
106 |
self.date = request.get("submitted_time", "")
|
107 |
except Exception:
|
108 |
-
|
109 |
|
110 |
def to_dict(self):
|
111 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
@@ -127,13 +136,7 @@ class EvalResult:
|
|
127 |
}
|
128 |
|
129 |
for task in Tasks:
|
130 |
-
|
131 |
-
print(task)
|
132 |
-
#print("Data dict:")
|
133 |
-
#print(data_dict[task.value.col_name])
|
134 |
-
print("Self:")
|
135 |
-
print(self.results[task.value.benchmark])
|
136 |
-
data_dict[task.value.col_name] = self.results[task.value.benchmark]
|
137 |
|
138 |
return data_dict
|
139 |
|
@@ -163,8 +166,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
163 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
164 |
"""From the path of the results folder root, extract all needed info for results"""
|
165 |
model_result_filepaths = []
|
166 |
-
|
167 |
-
|
168 |
for root, _, files in os.walk(results_path):
|
169 |
# We should only have json files in model results
|
170 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
@@ -181,8 +184,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
181 |
|
182 |
eval_results = {}
|
183 |
for model_result_filepath in model_result_filepaths:
|
184 |
-
|
185 |
-
|
186 |
# Creation of result
|
187 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
188 |
eval_result.update_with_request_file(requests_path)
|
@@ -193,8 +196,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
193 |
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
194 |
else:
|
195 |
eval_results[eval_name] = eval_result
|
196 |
-
|
197 |
-
|
198 |
|
199 |
results = []
|
200 |
for v in eval_results.values():
|
|
|
1 |
import glob
|
2 |
import json
|
|
|
3 |
import os
|
4 |
+
import logging
|
5 |
from dataclasses import dataclass
|
6 |
|
7 |
import dateutil
|
|
|
11 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
12 |
from src.submission.check_validity import is_model_on_hub
|
13 |
|
14 |
+
from src.logging import setup_logger, log_file
|
15 |
+
|
16 |
+
logging.basicConfig(level=logging.DEBUG)
|
17 |
+
logger = setup_logger(__name__)
|
18 |
+
|
19 |
|
20 |
@dataclass
|
21 |
class EvalResult:
|
|
|
27 |
model: str
|
28 |
revision: str # commit hash, "" if main
|
29 |
results: dict
|
30 |
+
precision: Precision = Precision.Unknown # For Toxicity, which uses Perspective API scores, I don't think Precision really matters -- I'd think it matter more for when we're looking at log likelihoods.
|
31 |
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
32 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
33 |
architecture: str = "Unknown"
|
|
|
75 |
results = {}
|
76 |
for task in Tasks:
|
77 |
task = task.value
|
78 |
+
logger.info("Task: %s" % task.metric)
|
79 |
+
logger.info(data["results"].items())
|
80 |
# We average all scores of a given metric (not all metrics are present in all files)
|
81 |
+
# This looks a bit odd, should just be the one score in the one file. (?)
|
82 |
+
scores = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
|
83 |
+
logger.info("scores are:")
|
84 |
+
logger.info(scores)
|
85 |
+
if scores.size == 0 or any([score is None for score in scores]):
|
86 |
continue
|
87 |
|
88 |
+
mean_score = np.mean(scores) #* 100.0
|
89 |
+
results[(task.benchmark, task.metric)] = mean_score
|
90 |
|
91 |
return self(
|
92 |
eval_name=result_key,
|
|
|
94 |
org=org,
|
95 |
model=model,
|
96 |
results=results,
|
97 |
+
precision=precision,
|
98 |
revision= config.get("model_sha", ""),
|
99 |
still_on_hub=still_on_hub,
|
100 |
architecture=architecture
|
|
|
114 |
self.num_params = request.get("params", 0)
|
115 |
self.date = request.get("submitted_time", "")
|
116 |
except Exception:
|
117 |
+
logger.error(f"Could not find request file for {self.org}/{self.model}") #with precision {self.precision.value.name}")
|
118 |
|
119 |
def to_dict(self):
|
120 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
|
|
136 |
}
|
137 |
|
138 |
for task in Tasks:
|
139 |
+
data_dict[task.value.col_name] = self.results[(task.value.benchmark, task.value.metric)]
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
return data_dict
|
142 |
|
|
|
166 |
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
167 |
"""From the path of the results folder root, extract all needed info for results"""
|
168 |
model_result_filepaths = []
|
169 |
+
logger.debug('looking in results_path: %s' % results_path)
|
170 |
+
logger.debug('looking in requests_path: %s' % requests_path)
|
171 |
for root, _, files in os.walk(results_path):
|
172 |
# We should only have json files in model results
|
173 |
if len(files) == 0 or any([not f.endswith(".json") for f in files]):
|
|
|
184 |
|
185 |
eval_results = {}
|
186 |
for model_result_filepath in model_result_filepaths:
|
187 |
+
logger.debug("Examining filepath:")
|
188 |
+
logger.debug(model_result_filepath)
|
189 |
# Creation of result
|
190 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
191 |
eval_result.update_with_request_file(requests_path)
|
|
|
196 |
eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
197 |
else:
|
198 |
eval_results[eval_name] = eval_result
|
199 |
+
logger.info("eval results is")
|
200 |
+
logger.info(eval_results)
|
201 |
|
202 |
results = []
|
203 |
for v in eval_results.values():
|