“WadoodAbdul”
committed on
Commit
·
eb6e73c
1
Parent(s):
7d35d7a
added m2_types and updated documentation
Browse files
- app.py +25 -100
- assets/image.png +0 -0
- src/about.py +30 -37
- src/display/css_html_js.py +7 -0
- src/display/utils.py +10 -3
- src/leaderboard/read_evals.py +79 -33
- src/populate.py +3 -2
app.py
CHANGED
@@ -12,11 +12,14 @@ from src.about import (
|
|
12 |
INTRODUCTION_TEXT,
|
13 |
LLM_BENCHMARKS_TEXT,
|
14 |
TITLE,
|
|
|
15 |
)
|
16 |
from src.display.css_html_js import custom_css
|
17 |
from src.display.utils import (
|
18 |
-
|
19 |
-
|
|
|
|
|
20 |
EVAL_COLS,
|
21 |
EVAL_TYPES,
|
22 |
NUMERIC_INTERVALS,
|
@@ -52,8 +55,11 @@ except Exception:
|
|
52 |
restart_space()
|
53 |
|
54 |
|
55 |
-
raw_data,
|
56 |
-
|
|
|
|
|
|
|
57 |
|
58 |
(
|
59 |
finished_eval_queue_df,
|
@@ -74,7 +80,7 @@ def update_table(
|
|
74 |
):
|
75 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
76 |
filtered_df = filter_queries(query, filtered_df)
|
77 |
-
df = select_columns(filtered_df, columns)
|
78 |
return df
|
79 |
|
80 |
|
@@ -82,13 +88,13 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
|
82 |
return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
|
83 |
|
84 |
|
85 |
-
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
86 |
always_here_cols = [
|
87 |
AutoEvalColumn.model_type_symbol.name,
|
88 |
AutoEvalColumn.model.name,
|
89 |
]
|
90 |
# We use COLS to maintain sorting
|
91 |
-
filtered_df = df[always_here_cols + [c for c in
|
92 |
return filtered_df
|
93 |
|
94 |
|
@@ -146,6 +152,7 @@ def filter_models(
|
|
146 |
demo = gr.Blocks(css=custom_css)
|
147 |
with demo:
|
148 |
gr.HTML(TITLE)
|
|
|
149 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
150 |
|
151 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
@@ -160,11 +167,11 @@ with demo:
|
|
160 |
)
|
161 |
with gr.Row():
|
162 |
shown_columns = gr.CheckboxGroup(
|
163 |
-
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
|
164 |
value=[
|
165 |
c.name
|
166 |
for c in fields(AutoEvalColumn)
|
167 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
168 |
],
|
169 |
label="Select columns to show",
|
170 |
elem_id="column-select",
|
@@ -197,9 +204,8 @@ with demo:
|
|
197 |
# interactive=True,
|
198 |
# elem_id="filter-columns-size",
|
199 |
# )
|
200 |
-
|
201 |
leaderboard_table = gr.components.Dataframe(
|
202 |
-
value=
|
203 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
204 |
datatype=TYPES,
|
205 |
elem_id="leaderboard-table",
|
@@ -209,8 +215,8 @@ with demo:
|
|
209 |
|
210 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
211 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
212 |
-
value=
|
213 |
-
headers=
|
214 |
datatype=TYPES,
|
215 |
visible=False,
|
216 |
)
|
@@ -254,11 +260,11 @@ with demo:
|
|
254 |
)
|
255 |
with gr.Row():
|
256 |
shown_columns = gr.CheckboxGroup(
|
257 |
-
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden],
|
258 |
value=[
|
259 |
c.name
|
260 |
for c in fields(AutoEvalColumn)
|
261 |
-
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
262 |
],
|
263 |
label="Select columns to show",
|
264 |
elem_id="column-select",
|
@@ -293,7 +299,7 @@ with demo:
|
|
293 |
# )
|
294 |
|
295 |
leaderboard_table = gr.components.Dataframe(
|
296 |
-
value=
|
297 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
298 |
datatype=TYPES,
|
299 |
elem_id="leaderboard-table",
|
@@ -303,8 +309,8 @@ with demo:
|
|
303 |
|
304 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
305 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
306 |
-
value=
|
307 |
-
headers=
|
308 |
datatype=TYPES,
|
309 |
visible=False,
|
310 |
)
|
@@ -345,87 +351,6 @@ with demo:
|
|
345 |
with gr.Row():
|
346 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
347 |
|
348 |
-
with gr.Column():
|
349 |
-
with gr.Accordion(
|
350 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
351 |
-
open=False,
|
352 |
-
):
|
353 |
-
with gr.Row():
|
354 |
-
finished_eval_table = gr.components.Dataframe(
|
355 |
-
value=finished_eval_queue_df,
|
356 |
-
headers=EVAL_COLS,
|
357 |
-
datatype=EVAL_TYPES,
|
358 |
-
row_count=5,
|
359 |
-
)
|
360 |
-
with gr.Accordion(
|
361 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
362 |
-
open=False,
|
363 |
-
):
|
364 |
-
with gr.Row():
|
365 |
-
running_eval_table = gr.components.Dataframe(
|
366 |
-
value=running_eval_queue_df,
|
367 |
-
headers=EVAL_COLS,
|
368 |
-
datatype=EVAL_TYPES,
|
369 |
-
row_count=5,
|
370 |
-
)
|
371 |
-
|
372 |
-
with gr.Accordion(
|
373 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
374 |
-
open=False,
|
375 |
-
):
|
376 |
-
with gr.Row():
|
377 |
-
pending_eval_table = gr.components.Dataframe(
|
378 |
-
value=pending_eval_queue_df,
|
379 |
-
headers=EVAL_COLS,
|
380 |
-
datatype=EVAL_TYPES,
|
381 |
-
row_count=5,
|
382 |
-
)
|
383 |
-
with gr.Row():
|
384 |
-
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
385 |
-
|
386 |
-
with gr.Row():
|
387 |
-
with gr.Column():
|
388 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
389 |
-
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
390 |
-
model_type = gr.Dropdown(
|
391 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
392 |
-
label="Model type",
|
393 |
-
multiselect=False,
|
394 |
-
value=None,
|
395 |
-
interactive=True,
|
396 |
-
)
|
397 |
-
|
398 |
-
with gr.Column():
|
399 |
-
precision = gr.Dropdown(
|
400 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
401 |
-
label="Precision",
|
402 |
-
multiselect=False,
|
403 |
-
value="float16",
|
404 |
-
interactive=True,
|
405 |
-
)
|
406 |
-
weight_type = gr.Dropdown(
|
407 |
-
choices=[i.value.name for i in WeightType],
|
408 |
-
label="Weights type",
|
409 |
-
multiselect=False,
|
410 |
-
value="Original",
|
411 |
-
interactive=True,
|
412 |
-
)
|
413 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
414 |
-
|
415 |
-
submit_button = gr.Button("Submit Eval")
|
416 |
-
submission_result = gr.Markdown()
|
417 |
-
submit_button.click(
|
418 |
-
add_new_eval,
|
419 |
-
[
|
420 |
-
model_name_textbox,
|
421 |
-
base_model_name_textbox,
|
422 |
-
revision_name_textbox,
|
423 |
-
precision,
|
424 |
-
weight_type,
|
425 |
-
model_type,
|
426 |
-
],
|
427 |
-
submission_result,
|
428 |
-
)
|
429 |
|
430 |
with gr.Row():
|
431 |
with gr.Accordion("📙 Citation", open=False):
|
@@ -440,4 +365,4 @@ with demo:
|
|
440 |
scheduler = BackgroundScheduler()
|
441 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
442 |
scheduler.start()
|
443 |
-
demo.queue(default_concurrency_limit=40).launch()
|
|
|
12 |
INTRODUCTION_TEXT,
|
13 |
LLM_BENCHMARKS_TEXT,
|
14 |
TITLE,
|
15 |
+
LOGO
|
16 |
)
|
17 |
from src.display.css_html_js import custom_css
|
18 |
from src.display.utils import (
|
19 |
+
DATASET_BENCHMARK_COLS,
|
20 |
+
TYPES_BENCHMARK_COLS,
|
21 |
+
DATASET_COLS,
|
22 |
+
M2_TYPES_COLS,
|
23 |
EVAL_COLS,
|
24 |
EVAL_TYPES,
|
25 |
NUMERIC_INTERVALS,
|
|
|
55 |
restart_space()
|
56 |
|
57 |
|
58 |
+
raw_data, datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "datasets")
|
59 |
+
datasets_leaderboard_df = datasets_original_df.copy()
|
60 |
+
|
61 |
+
raw_data, types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, M2_TYPES_COLS, TYPES_BENCHMARK_COLS, "m2_types")
|
62 |
+
types_leaderboard_df = types_original_df.copy()
|
63 |
|
64 |
(
|
65 |
finished_eval_queue_df,
|
|
|
80 |
):
|
81 |
filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
|
82 |
filtered_df = filter_queries(query, filtered_df)
|
83 |
+
df = select_columns(filtered_df, columns, list(hidden_df.columns))
|
84 |
return df
|
85 |
|
86 |
|
|
|
88 |
return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
|
89 |
|
90 |
|
91 |
+
def select_columns(df: pd.DataFrame, columns: list, cols:list) -> pd.DataFrame:
|
92 |
always_here_cols = [
|
93 |
AutoEvalColumn.model_type_symbol.name,
|
94 |
AutoEvalColumn.model.name,
|
95 |
]
|
96 |
# We use COLS to maintain sorting
|
97 |
+
filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
|
98 |
return filtered_df
|
99 |
|
100 |
|
|
|
152 |
demo = gr.Blocks(css=custom_css)
|
153 |
with demo:
|
154 |
gr.HTML(TITLE)
|
155 |
+
gr.HTML(LOGO, elem_classes="logo")
|
156 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
157 |
|
158 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
|
|
167 |
)
|
168 |
with gr.Row():
|
169 |
shown_columns = gr.CheckboxGroup(
|
170 |
+
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.m2_type_col],
|
171 |
value=[
|
172 |
c.name
|
173 |
for c in fields(AutoEvalColumn)
|
174 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.m2_type_col
|
175 |
],
|
176 |
label="Select columns to show",
|
177 |
elem_id="column-select",
|
|
|
204 |
# interactive=True,
|
205 |
# elem_id="filter-columns-size",
|
206 |
# )
|
|
|
207 |
leaderboard_table = gr.components.Dataframe(
|
208 |
+
value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
|
209 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
210 |
datatype=TYPES,
|
211 |
elem_id="leaderboard-table",
|
|
|
215 |
|
216 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
217 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
218 |
+
value=datasets_original_df[DATASET_COLS],
|
219 |
+
headers=DATASET_COLS,
|
220 |
datatype=TYPES,
|
221 |
visible=False,
|
222 |
)
|
|
|
260 |
)
|
261 |
with gr.Row():
|
262 |
shown_columns = gr.CheckboxGroup(
|
263 |
+
choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and not c.dataset_task_col],
|
264 |
value=[
|
265 |
c.name
|
266 |
for c in fields(AutoEvalColumn)
|
267 |
+
if c.displayed_by_default and not c.hidden and not c.never_hidden and not c.dataset_task_col
|
268 |
],
|
269 |
label="Select columns to show",
|
270 |
elem_id="column-select",
|
|
|
299 |
# )
|
300 |
|
301 |
leaderboard_table = gr.components.Dataframe(
|
302 |
+
value=types_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
|
303 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
304 |
datatype=TYPES,
|
305 |
elem_id="leaderboard-table",
|
|
|
309 |
|
310 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
311 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
312 |
+
value=types_original_df[M2_TYPES_COLS],
|
313 |
+
headers=M2_TYPES_COLS,
|
314 |
datatype=TYPES,
|
315 |
visible=False,
|
316 |
)
|
|
|
351 |
with gr.Row():
|
352 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
353 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
|
355 |
with gr.Row():
|
356 |
with gr.Accordion("📙 Citation", open=False):
|
|
|
365 |
scheduler = BackgroundScheduler()
|
366 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
367 |
scheduler.start()
|
368 |
+
demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
|
assets/image.png
ADDED
src/about.py
CHANGED
@@ -7,8 +7,7 @@ class Task:
|
|
7 |
benchmark: str
|
8 |
metric: str
|
9 |
col_name: str
|
10 |
-
|
11 |
-
|
12 |
|
13 |
# Select your tasks here
|
14 |
# ---------------------------------------------------
|
@@ -23,6 +22,21 @@ class Tasks(Enum):
|
|
23 |
# task5 = Task("", "f1", "")
|
24 |
# task6 = Task("", "f1", "")
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
NUM_FEWSHOT = 0 # Change with your few shot
|
28 |
# ---------------------------------------------------
|
@@ -30,28 +44,33 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
30 |
|
31 |
# Your leaderboard name
|
32 |
TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
|
33 |
-
|
34 |
# What does your leaderboard evaluate?
|
35 |
INTRODUCTION_TEXT = """
|
|
|
|
|
|
|
36 |
"""
|
37 |
|
38 |
# Which evaluations are you running? how can people reproduce what you have?
|
39 |
LLM_BENCHMARKS_TEXT = f"""
|
|
|
|
|
|
|
40 |
## About
|
41 |
-
Named Entity
|
42 |
|
43 |
-
|
44 |
-
To keep the evaluation widely relevant the entity types in the dataset are mapped to broader M2 types. More information on this mapping can be found here - M2-DATASETS-ARTICLE-LINK
|
45 |
|
46 |
-
###
|
47 |
-
📈 We evaluate models on
|
48 |
-
- NCBI
|
49 |
- CHIA
|
50 |
- BIORED
|
51 |
- BC5CD
|
52 |
|
53 |
### Evaluation Metrics
|
54 |
-
|
55 |
|
56 |
|
57 |
## Reproducibility
|
@@ -60,33 +79,7 @@ To reproduce our results, here is the commands you can run:
|
|
60 |
"""
|
61 |
|
62 |
EVALUATION_QUEUE_TEXT = """
|
63 |
-
|
64 |
-
|
65 |
-
### 1) Make sure you can load your model and tokenizer using AutoClasses:
|
66 |
-
```python
|
67 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
68 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
69 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
70 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
71 |
-
```
|
72 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
73 |
-
|
74 |
-
Note: make sure your model is public!
|
75 |
-
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
76 |
-
|
77 |
-
### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
|
78 |
-
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
|
79 |
-
|
80 |
-
### 3) Make sure your model has an open license!
|
81 |
-
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
82 |
-
|
83 |
-
### 4) Fill up your model card
|
84 |
-
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
85 |
-
|
86 |
-
## In case of model failure
|
87 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
88 |
-
Make sure you have followed the above steps first.
|
89 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
90 |
"""
|
91 |
|
92 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
|
|
7 |
benchmark: str
|
8 |
metric: str
|
9 |
col_name: str
|
10 |
+
|
|
|
11 |
|
12 |
# Select your tasks here
|
13 |
# ---------------------------------------------------
|
|
|
22 |
# task5 = Task("", "f1", "")
|
23 |
# task6 = Task("", "f1", "")
|
24 |
|
25 |
+
@dataclass
|
26 |
+
class M2Type:
|
27 |
+
benchmark: str
|
28 |
+
metric: str
|
29 |
+
col_name: str
|
30 |
+
|
31 |
+
class M2Types(Enum):
|
32 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
33 |
+
type0 = M2Type("condition", "f1", "CONDITION")
|
34 |
+
type1 = M2Type("measurement", "f1", "MEASUREMENT")
|
35 |
+
type2 = M2Type("drug", "f1", "DRUG")
|
36 |
+
type3 = M2Type("procedure", "f1", "PROCEDURE")
|
37 |
+
type4 = M2Type("gene", "f1", "GENE")
|
38 |
+
type5 = M2Type("gene variant", "f1", "GENE VARIANT")
|
39 |
+
|
40 |
|
41 |
NUM_FEWSHOT = 0 # Change with your few shot
|
42 |
# ---------------------------------------------------
|
|
|
44 |
|
45 |
# Your leaderboard name
|
46 |
TITLE = """<h1 align="center" id="space-title">MEDICS NER Leaderboard</h1>"""
|
47 |
+
LOGO = """<img src="file/assets/image.png" alt="M2 X HF" width="500" height="333">"""
|
48 |
# What does your leaderboard evaluate?
|
49 |
INTRODUCTION_TEXT = """
|
50 |
+
Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare as it is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
|
51 |
+
The datasets used for this evaluation encompass a wide range of medical entities, including diseases, symptoms, medications, procedures and anatomical terms. These datasets are sourced from openly available clinical data (including annotations) to ensure comprehensive coverage and reflect the complexity of real-world medical language. More details about the datasets included can be found in the "About" section.
|
52 |
+
The evaluation metrics used in this leaderboard focus primarily on the F1-score, a widely recognized measure of a model's accuracy. More details about the evaluation metric can be found in the "About" section.
|
53 |
"""
|
54 |
|
55 |
# Which evaluations are you running? how can people reproduce what you have?
|
56 |
LLM_BENCHMARKS_TEXT = f"""
|
57 |
+
|
58 |
+
Note: The purpose of this evaluation is purely academic and exploratory. The models assessed here have not been approved for clinical use, and their results should not be interpreted as clinically validated. The leaderboard serves as a platform for researchers to compare models, understand their strengths and limitations, and drive further advancements in the field of clinical NLP.
|
59 |
+
|
60 |
## About
|
61 |
+
The Named Clinical Entity Recognition Leaderboard is aimed at advancing the field of natural language processing in healthcare. It provides a standardized platform for evaluating and comparing the performance of various language models in recognizing named clinical entities, a critical task for applications such as clinical documentation, decision support, and information extraction. By fostering transparency and facilitating benchmarking, the leaderboard's goal is to drive innovation and improvement in NLP models. It also helps researchers identify the strengths and weaknesses of different approaches, ultimately contributing to the development of more accurate and reliable tools for clinical use. Despite its exploratory nature, the leaderboard aims to play a role in guiding research and ensuring that advancements are grounded in rigorous and comprehensive evaluations.
|
62 |
|
63 |
+
## How it works
|
|
|
64 |
|
65 |
+
### Datasets
|
66 |
+
📈 We evaluate the models on 4 datasets, encompassing 6 entity types
|
67 |
+
- NCBI
|
68 |
- CHIA
|
69 |
- BIORED
|
70 |
- BC5CD
|
71 |
|
72 |
### Evaluation Metrics
|
73 |
+
We treat NER objects as spans (with character offsets) rather than token-level artifacts. This enables us to expand to nested NER scenarios easily.
|
74 |
|
75 |
|
76 |
## Reproducibility
|
|
|
79 |
"""
|
80 |
|
81 |
EVALUATION_QUEUE_TEXT = """
|
82 |
+
Follow the steps detailed in the [medics_ner](https://github.com/WadoodAbdul/medics_ner/blob/3b415e9c4c9561ce5168374813072bde36658ff4/docs/submit_to_leaderboard.md) repo to upload your model to the leaderboard.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
"""
|
84 |
|
85 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
src/display/css_html_js.py
CHANGED
@@ -1,4 +1,11 @@
|
|
1 |
custom_css = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
.markdown-text {
|
4 |
font-size: 16px !important;
|
|
|
1 |
custom_css = """
|
2 |
+
.logo {
|
3 |
+
width: 500px;
|
4 |
+
height: auto;
|
5 |
+
margin: 0 auto;
|
6 |
+
max-width: 100%
|
7 |
+
object-fit: contain;
|
8 |
+
}
|
9 |
|
10 |
.markdown-text {
|
11 |
font-size: 16px !important;
|
src/display/utils.py
CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
|
|
4 |
import pandas as pd
|
5 |
|
6 |
from src.about import Tasks
|
|
|
7 |
|
8 |
|
9 |
def fields(raw_class):
|
@@ -20,6 +21,8 @@ class ColumnContent:
|
|
20 |
displayed_by_default: bool
|
21 |
hidden: bool = False
|
22 |
never_hidden: bool = False
|
|
|
|
|
23 |
|
24 |
|
25 |
## Leaderboard columns
|
@@ -30,7 +33,9 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
30 |
# Scores
|
31 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
|
32 |
for task in Tasks:
|
33 |
-
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False)])
|
|
|
|
|
34 |
# Model information
|
35 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
36 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
@@ -128,7 +133,8 @@ class Precision(Enum):
|
|
128 |
|
129 |
|
130 |
# Column selection
|
131 |
-
|
|
|
132 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
133 |
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
134 |
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
@@ -136,7 +142,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
|
|
136 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
137 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
138 |
|
139 |
-
|
|
|
140 |
|
141 |
NUMERIC_INTERVALS = {
|
142 |
"?": pd.Interval(-1, 0, closed="right"),
|
|
|
4 |
import pandas as pd
|
5 |
|
6 |
from src.about import Tasks
|
7 |
+
from src.about import M2Types
|
8 |
|
9 |
|
10 |
def fields(raw_class):
|
|
|
21 |
displayed_by_default: bool
|
22 |
hidden: bool = False
|
23 |
never_hidden: bool = False
|
24 |
+
dataset_task_col: bool = False
|
25 |
+
m2_type_col: bool = False
|
26 |
|
27 |
|
28 |
## Leaderboard columns
|
|
|
33 |
# Scores
|
34 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True)])
|
35 |
for task in Tasks:
|
36 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True)])
|
37 |
+
for task in M2Types:
|
38 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, m2_type_col=True)])
|
39 |
# Model information
|
40 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
41 |
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
|
133 |
|
134 |
|
135 |
# Column selection
|
136 |
+
DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.m2_type_col]
|
137 |
+
M2_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
|
138 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
139 |
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
140 |
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
|
|
142 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
143 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
144 |
|
145 |
+
DATASET_BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
146 |
+
TYPES_BENCHMARK_COLS = [t.value.col_name for t in M2Types]
|
147 |
|
148 |
NUMERIC_INTERVALS = {
|
149 |
"?": pd.Interval(-1, 0, closed="right"),
|
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
|
|
8 |
import numpy as np
|
9 |
|
10 |
from src.display.formatting import make_clickable_model
|
11 |
-
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
|
12 |
from src.submission.check_validity import is_model_on_hub
|
13 |
|
14 |
|
@@ -21,7 +21,8 @@ class EvalResult:
|
|
21 |
org: str
|
22 |
model: str
|
23 |
revision: str # commit hash, "" if main
|
24 |
-
|
|
|
25 |
precision: Precision = Precision.Unknown
|
26 |
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
27 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
@@ -42,6 +43,9 @@ class EvalResult:
|
|
42 |
|
43 |
# Precision
|
44 |
precision = Precision.from_str(config.get("model_dtype"))
|
|
|
|
|
|
|
45 |
|
46 |
# Get model and org
|
47 |
org_and_model = config.get("model_name", config.get("model_args", None))
|
@@ -67,28 +71,44 @@ class EvalResult:
|
|
67 |
architecture = ";".join(architectures)
|
68 |
|
69 |
# Extract results available in this file (some results are split in several files)
|
70 |
-
|
71 |
for task in Tasks:
|
72 |
task = task.value
|
73 |
|
74 |
# We average all scores of a given metric (not all metrics are present in all files)
|
75 |
-
accs = np.array([v.get(task.metric, None) for k, v in data["
|
76 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
77 |
continue
|
78 |
|
79 |
mean_acc = np.mean(accs) # * 100.0
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
return self(
|
83 |
eval_name=result_key,
|
84 |
full_model=full_model,
|
85 |
org=org,
|
86 |
model=model,
|
87 |
-
|
|
|
88 |
precision=precision,
|
89 |
revision=config.get("model_sha", ""),
|
90 |
still_on_hub=still_on_hub,
|
91 |
architecture=architecture,
|
|
|
|
|
|
|
92 |
)
|
93 |
|
94 |
def update_with_request_file(self, requests_path):
|
@@ -111,29 +131,54 @@ class EvalResult:
|
|
111 |
)
|
112 |
print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
|
113 |
|
114 |
-
def to_dict(self):
|
115 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
|
139 |
def get_request_file_for_model(requests_path, model_name, precision):
|
@@ -181,15 +226,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
181 |
|
182 |
# Store results of same eval together
|
183 |
eval_name = eval_result.eval_name
|
184 |
-
if eval_name in eval_results.keys():
|
185 |
-
|
186 |
-
else:
|
187 |
-
|
188 |
|
189 |
results = []
|
|
|
190 |
for v in eval_results.values():
|
191 |
try:
|
192 |
-
v.to_dict() # we test if the dict version is complete
|
193 |
results.append(v)
|
194 |
except KeyError: # not all eval values present
|
195 |
continue
|
|
|
8 |
import numpy as np
|
9 |
|
10 |
from src.display.formatting import make_clickable_model
|
11 |
+
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, M2Types
|
12 |
from src.submission.check_validity import is_model_on_hub
|
13 |
|
14 |
|
|
|
21 |
org: str
|
22 |
model: str
|
23 |
revision: str # commit hash, "" if main
|
24 |
+
dataset_results: dict
|
25 |
+
m2_type_results:dict
|
26 |
precision: Precision = Precision.Unknown
|
27 |
model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
|
28 |
weight_type: WeightType = WeightType.Original # Original or Adapter
|
|
|
43 |
|
44 |
# Precision
|
45 |
precision = Precision.from_str(config.get("model_dtype"))
|
46 |
+
model_type = ModelType.from_str(config.get("model_type", ""))
|
47 |
+
license = config.get("license", "?")
|
48 |
+
num_params = config.get("num_params", "?")
|
49 |
|
50 |
# Get model and org
|
51 |
org_and_model = config.get("model_name", config.get("model_args", None))
|
|
|
71 |
architecture = ";".join(architectures)
|
72 |
|
73 |
# Extract results available in this file (some results are split in several files)
|
74 |
+
dataset_results = {}
|
75 |
for task in Tasks:
|
76 |
task = task.value
|
77 |
|
78 |
# We average all scores of a given metric (not all metrics are present in all files)
|
79 |
+
accs = np.array([v.get(task.metric, None) for k, v in data["dataset_results"].items() if task.benchmark == k])
|
80 |
if accs.size == 0 or any([acc is None for acc in accs]):
|
81 |
continue
|
82 |
|
83 |
mean_acc = np.mean(accs) # * 100.0
|
84 |
+
dataset_results[task.benchmark] = mean_acc
|
85 |
+
|
86 |
+
types_results = {}
|
87 |
+
for m2_type in M2Types:
|
88 |
+
m2_type = m2_type.value
|
89 |
+
|
90 |
+
# We average all scores of a given metric (not all metrics are present in all files)
|
91 |
+
accs = np.array([v.get(m2_type.metric, None) for k, v in data["m2_type_results"].items() if m2_type.benchmark == k])
|
92 |
+
if accs.size == 0 or any([acc is None for acc in accs]):
|
93 |
+
continue
|
94 |
+
|
95 |
+
mean_acc = np.mean(accs) # * 100.0
|
96 |
+
types_results[m2_type.benchmark] = mean_acc
|
97 |
|
98 |
return self(
|
99 |
eval_name=result_key,
|
100 |
full_model=full_model,
|
101 |
org=org,
|
102 |
model=model,
|
103 |
+
dataset_results=dataset_results,
|
104 |
+
m2_type_results=types_results,
|
105 |
precision=precision,
|
106 |
revision=config.get("model_sha", ""),
|
107 |
still_on_hub=still_on_hub,
|
108 |
architecture=architecture,
|
109 |
+
model_type=model_type,
|
110 |
+
num_params=num_params,
|
111 |
+
license=license
|
112 |
)
|
113 |
|
114 |
def update_with_request_file(self, requests_path):
|
|
|
131 |
)
|
132 |
print(f" Args used were - {request_file=}, {requests_path=}, {self.full_model=},")
|
133 |
|
134 |
+
def to_dict(self, subset):
|
135 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
136 |
+
if subset == "datasets":
|
137 |
+
average = sum([v for v in self.dataset_results.values() if v is not None]) / len(Tasks)
|
138 |
+
data_dict = {
|
139 |
+
"eval_name": self.eval_name, # not a column, just a save name,
|
140 |
+
AutoEvalColumn.precision.name: self.precision.value.name,
|
141 |
+
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
142 |
+
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
143 |
+
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
144 |
+
AutoEvalColumn.architecture.name: self.architecture,
|
145 |
+
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
146 |
+
AutoEvalColumn.revision.name: self.revision,
|
147 |
+
AutoEvalColumn.average.name: average,
|
148 |
+
AutoEvalColumn.license.name: self.license,
|
149 |
+
AutoEvalColumn.likes.name: self.likes,
|
150 |
+
AutoEvalColumn.params.name: self.num_params,
|
151 |
+
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
152 |
+
}
|
153 |
+
|
154 |
+
for task in Tasks:
|
155 |
+
data_dict[task.value.col_name] = self.dataset_results[task.value.benchmark]
|
156 |
+
|
157 |
+
return data_dict
|
158 |
+
|
159 |
+
if subset == "m2_types":
|
160 |
+
average = sum([v for v in self.m2_type_results.values() if v is not None]) / len(M2Types)
|
161 |
+
data_dict = {
|
162 |
+
"eval_name": self.eval_name, # not a column, just a save name,
|
163 |
+
AutoEvalColumn.precision.name: self.precision.value.name,
|
164 |
+
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
165 |
+
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
166 |
+
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
167 |
+
AutoEvalColumn.architecture.name: self.architecture,
|
168 |
+
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
169 |
+
AutoEvalColumn.revision.name: self.revision,
|
170 |
+
AutoEvalColumn.average.name: average,
|
171 |
+
AutoEvalColumn.license.name: self.license,
|
172 |
+
AutoEvalColumn.likes.name: self.likes,
|
173 |
+
AutoEvalColumn.params.name: self.num_params,
|
174 |
+
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
175 |
+
}
|
176 |
+
|
177 |
+
for m2_type in M2Types:
|
178 |
+
data_dict[m2_type.value.col_name] = self.m2_type_results[m2_type.value.benchmark]
|
179 |
+
|
180 |
+
return data_dict
|
181 |
+
|
182 |
|
183 |
|
184 |
def get_request_file_for_model(requests_path, model_name, precision):
|
|
|
226 |
|
227 |
# Store results of same eval together
|
228 |
eval_name = eval_result.eval_name
|
229 |
+
# if eval_name in eval_results.keys():
|
230 |
+
# eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
|
231 |
+
# else:
|
232 |
+
eval_results[eval_name] = eval_result
|
233 |
|
234 |
results = []
|
235 |
+
# m2_type_results = []
|
236 |
for v in eval_results.values():
|
237 |
try:
|
238 |
+
v.to_dict(subset="dataset") # we test if the dict version is complete
|
239 |
results.append(v)
|
240 |
except KeyError: # not all eval values present
|
241 |
continue
|
src/populate.py
CHANGED
@@ -8,13 +8,14 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
|
10 |
|
11 |
-
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
-
all_data_json = [v.to_dict() for v in raw_data]
|
15 |
|
16 |
df = pd.DataFrame.from_records(all_data_json)
|
17 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
|
|
18 |
df = df[cols].round(decimals=2)
|
19 |
|
20 |
# filter out if any of the benchmarks have not been produced
|
|
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
|
10 |
|
11 |
+
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, subset:str) -> pd.DataFrame:
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
+
all_data_json = [v.to_dict(subset=subset) for v in raw_data]
|
15 |
|
16 |
df = pd.DataFrame.from_records(all_data_json)
|
17 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
18 |
+
cols = list(set(df.columns).intersection(set(cols)))
|
19 |
df = df[cols].round(decimals=2)
|
20 |
|
21 |
# filter out if any of the benchmarks have not been produced
|