Jellyfish042 committed · cc8a66b
1 Parent(s): 49c2344
update

app.py CHANGED
@@ -6,6 +6,7 @@ import requests
 import huggingface_hub
 from huggingface_hub.utils._errors import EntryNotFoundError, RepositoryNotFoundError
 from dotenv import load_dotenv
+from matplotlib.colors import LinearSegmentedColormap

 load_dotenv()
 webhook_url = os.environ.get("WEBHOOK_URL")
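
The new matplotlib import is used further down to build the color ramp for the table's background gradient. For reference, a minimal standalone sketch (not part of app.py) of what LinearSegmentedColormap.from_list does with the three anchor colors this commit uses:

# Standalone sketch: sample the green -> white -> red ramp that the
# commit builds for the background gradient.
from matplotlib.colors import LinearSegmentedColormap, to_hex

colors = ["#63be7b", "#ffffff", "#f8696b"]
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)

# The colormap maps 0.0..1.0 to interpolated RGBA values.
print(to_hex(cmap(0.0)), to_hex(cmap(1.0)))   # endpoints: #63be7b #f8696b
print(to_hex(cmap(0.5)))                      # midpoint: approximately white
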
@@ -119,21 +120,25 @@ def get_folders_matching_format(directory):


 def get_unique_column_names(all_data):
-    column_names =
+    column_names = {}

     for folder_name, files in all_data.items():
         for file_name, sheets in files.items():
             for sheet_name, dataframe in sheets.items():
-
-
-
-    column_names.remove('Parameters Count (B)')
+                for column in dataframe.columns:
+                    if column not in ['Name', 'Average (The lower the better)', 'Parameters Count (B)']:
+                        column_names[column] = None

-    return list(column_names)
+    return list(column_names.keys())


-def update_table(period: str,
-
+def update_table(period: str,
+                 models: list,
+                 metric: str,
+                 visible_columns: list,
+                 color_columns: list,
+                 sort_by: str = 'Average (The lower the better)',
+                 ascending: bool = True):
     target_data = all_data[period]
     target_metric = metric_to_sheet[metric]

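
get_unique_column_names now collects column names into a dict and returns its keys; dict keys keep insertion order (Python 3.7+), so this dedupes the data-source columns without reordering them. A self-contained sketch of the same pattern on toy frames (the column names below are made up for illustration):

# Standalone sketch (toy frames, made-up column names): dict keys keep
# insertion order, so this dedupes columns without scrambling their order.
import pandas as pd

frames = [
    pd.DataFrame(columns=['Name', 'source_a', 'source_b', 'Parameters Count (B)']),
    pd.DataFrame(columns=['Name', 'source_c', 'source_a', 'Parameters Count (B)']),
]

column_names = {}
for dataframe in frames:
    for column in dataframe.columns:
        if column not in ['Name', 'Average (The lower the better)', 'Parameters Count (B)']:
            column_names[column] = None

print(list(column_names.keys()))  # ['source_a', 'source_b', 'source_c']
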
@@ -142,15 +147,38 @@ def update_table(period: str, models: list, metric: str, visible_columns: list,
     combined_data = pd.concat([target_data[model][target_metric] for model in target_model_size], axis=0)
     combined_data['Name'] = combined_data['Name'].apply(lambda x: x.replace('.pth', ''))

+    combined_data.reset_index(drop=True, inplace=True)
+
     if 'Average (The lower the better)' in combined_data.columns:
         relevant_columns = [col for col in visible_columns if
                             col not in ['Name', 'Parameters Count (B)', 'Average (The lower the better)']]
         combined_data['Average (The lower the better)'] = round(combined_data[relevant_columns].mean(axis=1), 3)

         sorted_data = combined_data.sort_values(by=sort_by, ascending=ascending)
-
+        sorted_data = sorted_data.rename(columns={'Average (The lower the better)': 'Average (lower=better)'})
+        visible_columns = ['Name', 'Parameters Count (B)', 'Average (lower=better)'] + visible_columns
         filtered_data = sorted_data[visible_columns]
-
+
+        filtered_data.columns = [col.replace('_', ' ') for col in filtered_data.columns]
+
+        formatter = {col: "{:.3f}" for col in filtered_data.columns if
+                     filtered_data[col].dtype in ['float64', 'float32']}
+
+        # color gradient
+        colors = ["#63be7b", "#ffffff", "#f8696b"]
+        cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
+        target_color_columns = []
+        if 'Average' in color_columns:
+            target_color_columns.append('Average (lower=better)')
+        if 'Individual Tests' in color_columns:
+            target_color_columns.extend([col for col in filtered_data.columns if col not in ['Name', 'Parameters Count (B)', 'Average (lower=better)']])
+
+        styler = filtered_data.style.format(formatter).background_gradient(
+            cmap=cmap,
+            subset=target_color_columns
+        )
+
+        return styler
     else:
         return pd.DataFrame()

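
With this change update_table returns a pandas Styler rather than a plain DataFrame: float columns are formatted to three decimals and the selected columns get the green-to-red background gradient. A minimal standalone sketch of the same format(...).background_gradient(...) pattern on toy data (names are illustrative):

# Standalone sketch (toy data): three-decimal formatting plus a background
# gradient restricted to chosen columns, mirroring the pattern above.
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap

df = pd.DataFrame({
    'Name': ['model-a', 'model-b', 'model-c'],
    'Average (lower=better)': [3.1415, 2.7182, 3.3333],
    'some test': [3.0, 2.5, 3.4],
})

cmap = LinearSegmentedColormap.from_list("custom_cmap", ["#63be7b", "#ffffff", "#f8696b"])
formatter = {col: "{:.3f}" for col in df.columns if df[col].dtype in ['float64', 'float32']}

styler = df.style.format(formatter).background_gradient(
    cmap=cmap,
    subset=['Average (lower=better)', 'some test'],  # only these columns are colored
)
# In the Space, an object like `styler` is what update_table returns
# and what gr.Dataframe displays.
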
@@ -210,10 +238,25 @@ initial_period = time_list[-1]
 initial_models = model_size_list[:1]
 initial_metric = metric_list[0]
 initial_columns = get_unique_column_names(all_data)
+initial_colors = ['Average']

-initial_data = update_table(initial_period, initial_models, initial_metric, initial_columns)
+initial_data = update_table(initial_period, initial_models, initial_metric, initial_columns, initial_colors)

-
+css = '''
+.gradio-container {
+    max-width: 95% !important;
+}
+.tab-buttons button {
+    font-size: 1.3em;
+}
+.gr-dataframe th {
+    white-space: normal;
+    word-break: break-word;
+}
+
+'''
+
+with gr.Blocks(css=css) as demo:
     gr.HTML('<h1 style="text-align:center"><span style="font-size:1.3em">π Uncheatable Eval Leaderboard</span></h1>')
     gr.HTML(
         "<h1 style='text-align:center'><span style='font-size:0.8em'>Welcome to Uncheatable Eval, where fancy fine-tuning and cheating won’t work π«; only compute π», data π, and real innovation π₯ can prevail!</span></h1>")
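
The CSS that was previously passed inline to gr.Blocks now lives in a named multi-line string. A minimal sketch of handing custom CSS to a Blocks app via css= (the selector is illustrative); note that selectors targeting Gradio's generated markup, such as .gr-dataframe th, depend on internal class names and may need updating across Gradio versions:

# Standalone sketch: custom CSS handed to gr.Blocks via css=, as app.py now does.
import gradio as gr

css = '''
.gradio-container {
    max-width: 95% !important;
}
'''

with gr.Blocks(css=css) as demo:
    gr.Markdown("## CSS demo")

if __name__ == "__main__":
    demo.launch()
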
@@ -225,20 +268,30 @@ with gr.Blocks(css=".gradio-container{max-width:95%!important} .tab-buttons butt
                 model_selector = gr.CheckboxGroup(label="Model", choices=model_size_list, value=model_size_list[0])
                 metric_selector = gr.Dropdown(label="Metric", choices=metric_list, value=metric_list[0])
             with gr.Column():
+                color_selector = gr.CheckboxGroup(label="Colored Columns",
+                                                  choices=['Average', 'Individual Tests'],
+                                                  value=['Average'])
                 colfilter = gr.CheckboxGroup(label="Data Source",
                                              choices=get_unique_column_names(all_data),
                                              value=get_unique_column_names(all_data))

-        table = gr.Dataframe(initial_data)
+        table = gr.Dataframe(initial_data, column_widths=[110, 35, 35, 35, 35, 35, 35, 35, 35, 35], wrap=True)

-        period_selector.change(update_table,
+        period_selector.change(update_table,
+                               inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
                                outputs=table)
-        model_selector.change(update_table,
+        model_selector.change(update_table,
+                              inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
                               outputs=table)
-        metric_selector.change(update_table,
+        metric_selector.change(update_table,
+                               inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
                                outputs=table)
-        colfilter.change(update_table,
+        colfilter.change(update_table,
+                         inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
                          outputs=table)
+        color_selector.change(update_table,
+                              inputs=[period_selector, model_selector, metric_selector, colfilter, color_selector],
+                              outputs=table)

     with gr.Tab("π MultiLang"):
         gr.Markdown("## Coming soon...")
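
Every selector's .change event now re-runs update_table with the full set of controls as inputs, including the new color_selector. Since the inputs list is identical for all five events, the wiring could also be written once in a loop; a self-contained sketch of that pattern with placeholder widgets (not the Space's actual code):

# Standalone sketch (placeholder widgets and data): declare the shared
# inputs list once and hook up every control's .change in a loop.
import gradio as gr
import pandas as pd

def update_table(period: str, metric: str) -> pd.DataFrame:
    # Stand-in for the real update_table.
    return pd.DataFrame({'period': [period], 'metric': [metric]})

with gr.Blocks() as demo:
    period_selector = gr.Dropdown(label="Period", choices=["2024-02", "2024-03"], value="2024-03")
    metric_selector = gr.Dropdown(label="Metric", choices=["metric-a", "metric-b"], value="metric-a")
    table = gr.Dataframe(update_table("2024-03", "metric-a"))

    controls = [period_selector, metric_selector]
    for control in controls:
        # Each control re-computes the table from the full set of inputs.
        control.change(update_table, inputs=controls, outputs=table)

if __name__ == "__main__":
    demo.launch()
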