Spaces:
Running
Running
mervenoyan
committed on
Commit
·
ae1692d
1
Parent(s):
9541eae
misc improvements
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
-
from huggingface_hub.hf_api import create_repo, upload_folder, upload_file
|
4 |
from huggingface_hub.repository import Repository
|
5 |
import subprocess
|
6 |
import os
|
@@ -12,8 +12,9 @@ import dabl
|
|
12 |
import re
|
13 |
|
14 |
|
15 |
-
def analyze_datasets(dataset, dataset_name,
|
16 |
df = pd.read_csv(dataset.name)
|
|
|
17 |
if column is not None:
|
18 |
analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
|
19 |
else:
|
@@ -21,7 +22,7 @@ def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwi
|
|
21 |
analyze_report.show_html('index.html', open_browser=False)
|
22 |
repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
|
23 |
|
24 |
-
upload_file(path_or_fileobj ="./index.html", path_in_repo = "index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
|
25 |
readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
|
26 |
with open("README.md", "w+") as f:
|
27 |
f.write(readme)
|
@@ -40,30 +41,47 @@ def extract_estimator_config(model):
|
|
40 |
table += f"| {hyperparameter} | {value} |\n"
|
41 |
return table
|
42 |
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
df = pd.read_csv(dataset.name)
|
46 |
-
|
47 |
-
|
48 |
X = df_clean.drop(column, axis = 1)
|
49 |
y = df_clean[column]
|
|
|
50 |
with tempfile.TemporaryDirectory() as tmpdirname:
|
51 |
from contextlib import redirect_stdout
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
print('Logging training')
|
56 |
-
fc.fit(X, y)
|
57 |
repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
|
58 |
|
59 |
readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
|
60 |
readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
|
61 |
-
readme+="Metrics of the best model
|
62 |
for elem in str(fc.current_best_).split("\n"):
|
63 |
readme+= f"{elem}\n\n"
|
64 |
-
readme+= "\n\
|
65 |
readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
|
66 |
-
|
67 |
with open(f"{tmpdirname}/README.md", "w+") as f:
|
68 |
f.write(readme)
|
69 |
with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
|
@@ -76,7 +94,7 @@ def train_baseline(dataset, username, dataset_name, token, column):
|
|
76 |
|
77 |
with gr.Blocks() as demo:
|
78 |
main_title = gr.Markdown("""# Baseline Trainer 💪🌟✨""")
|
79 |
-
main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card.""")
|
80 |
|
81 |
|
82 |
with gr.Tabs():
|
@@ -87,17 +105,16 @@ with gr.Blocks() as demo:
|
|
87 |
description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
|
88 |
dataset = gr.File(label = "Dataset")
|
89 |
column = gr.Text(label = "Enter target variable:")
|
|
|
90 |
dataset_name = gr.Text(label = "Enter dataset name:")
|
91 |
-
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
|
92 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
93 |
-
username = gr.Textbox(label = "Your Hugging Face User Name")
|
94 |
inference_run = gr.Button("Train")
|
95 |
inference_progress = gr.StatusTracker(cover_container=True)
|
96 |
|
97 |
outcome = gr.outputs.Textbox(label = "Progress")
|
98 |
inference_run.click(
|
99 |
train_baseline,
|
100 |
-
inputs=[dataset,
|
101 |
outputs=outcome,
|
102 |
status_tracker=inference_progress,
|
103 |
)
|
@@ -110,15 +127,14 @@ with gr.Blocks() as demo:
|
|
110 |
column = gr.Text(label = "Compare dataset against a target variable (Optional)")
|
111 |
pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
|
112 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
113 |
-
username = gr.Textbox(label = "Your Hugging Face User Name")
|
114 |
dataset_name = gr.Textbox(label = "Dataset Name")
|
115 |
-
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub
|
116 |
inference_run = gr.Button("Infer")
|
117 |
inference_progress = gr.StatusTracker(cover_container=True)
|
118 |
outcome = gr.outputs.Textbox()
|
119 |
inference_run.click(
|
120 |
analyze_datasets,
|
121 |
-
inputs=[dataset, dataset_name,
|
122 |
outputs=outcome,
|
123 |
status_tracker=inference_progress,
|
124 |
)
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
+
from huggingface_hub.hf_api import create_repo, upload_folder, upload_file, HfApi
|
4 |
from huggingface_hub.repository import Repository
|
5 |
import subprocess
|
6 |
import os
|
|
|
12 |
import re
|
13 |
|
14 |
|
15 |
+
def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
|
16 |
df = pd.read_csv(dataset.name)
|
17 |
+
username = HfApi().whoami(token=token)["name"]
|
18 |
if column is not None:
|
19 |
analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
|
20 |
else:
|
|
|
22 |
analyze_report.show_html('index.html', open_browser=False)
|
23 |
repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
|
24 |
|
25 |
+
upload_file(path_or_fileobj ="./index.html", path_in_repo = "./index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
|
26 |
readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
|
27 |
with open("README.md", "w+") as f:
|
28 |
f.write(readme)
|
|
|
41 |
table += f"| {hyperparameter} | {value} |\n"
|
42 |
return table
|
43 |
|
44 |
+
def detect_training(df, column):
|
45 |
+
if dabl.detect_types(df)["continuous"][column] or dabl.detect_types(df)["dirty_float"][column]:
|
46 |
+
trainer = dabl.SimpleRegressor()
|
47 |
+
elif dabl.detect_types(df)["categorical"][column] or dabl.detect_types(df)["low_card_int"][column] or dabl.detect_types(df)["free_string"][column]:
|
48 |
+
trainer = dabl.SimpleClassifier()
|
49 |
+
return trainer
|
50 |
+
|
51 |
+
def edit_types(df):
|
52 |
+
types = dabl.detect_types(df)
|
53 |
+
low_cardinality = types[types["low_card_int"] == True].index.tolist()
|
54 |
+
dirty_float = types[types["dirty_float"] == True].index.tolist()
|
55 |
+
type_hints = {}
|
56 |
+
for col in low_cardinality:
|
57 |
+
type_hints[col] = "categorical"
|
58 |
+
for col in dirty_float:
|
59 |
+
type_hints[col] = "continuous"
|
60 |
+
df_clean = dabl.clean(df, type_hints=type_hints)
|
61 |
+
return df_clean
|
62 |
+
|
63 |
+
def train_baseline(dataset, dataset_name, token, column):
|
64 |
df = pd.read_csv(dataset.name)
|
65 |
+
df_clean = edit_types(df)
|
66 |
+
fc = detect_training(df_clean, column)
|
67 |
X = df_clean.drop(column, axis = 1)
|
68 |
y = df_clean[column]
|
69 |
+
|
70 |
with tempfile.TemporaryDirectory() as tmpdirname:
|
71 |
from contextlib import redirect_stdout
|
72 |
|
73 |
+
fc.fit(X, y)
|
74 |
+
username = HfApi().whoami(token=token)["name"]
|
|
|
|
|
75 |
repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
|
76 |
|
77 |
readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
|
78 |
readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
|
79 |
+
readme+="**Metrics of the best model:**\n\n"
|
80 |
for elem in str(fc.current_best_).split("\n"):
|
81 |
readme+= f"{elem}\n\n"
|
82 |
+
readme+= "\n\n**See model plot below:**\n\n"
|
83 |
readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
|
84 |
+
readme+= "\n\nThis model is trained with dabl library as a baseline, for better results, use AutoTrain.\n\n"
|
85 |
with open(f"{tmpdirname}/README.md", "w+") as f:
|
86 |
f.write(readme)
|
87 |
with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
|
|
|
94 |
|
95 |
with gr.Blocks() as demo:
|
96 |
main_title = gr.Markdown("""# Baseline Trainer 💪🌟✨""")
|
97 |
+
main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card. For better results, use AutoTrain.""")
|
98 |
|
99 |
|
100 |
with gr.Tabs():
|
|
|
105 |
description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
|
106 |
dataset = gr.File(label = "Dataset")
|
107 |
column = gr.Text(label = "Enter target variable:")
|
108 |
+
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
|
109 |
dataset_name = gr.Text(label = "Enter dataset name:")
|
|
|
110 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
|
|
111 |
inference_run = gr.Button("Train")
|
112 |
inference_progress = gr.StatusTracker(cover_container=True)
|
113 |
|
114 |
outcome = gr.outputs.Textbox(label = "Progress")
|
115 |
inference_run.click(
|
116 |
train_baseline,
|
117 |
+
inputs=[dataset, dataset_name, token, column],
|
118 |
outputs=outcome,
|
119 |
status_tracker=inference_progress,
|
120 |
)
|
|
|
127 |
column = gr.Text(label = "Compare dataset against a target variable (Optional)")
|
128 |
pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
|
129 |
token = gr.Textbox(label = "Your Hugging Face Token")
|
|
|
130 |
dataset_name = gr.Textbox(label = "Dataset Name")
|
131 |
+
pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
|
132 |
inference_run = gr.Button("Infer")
|
133 |
inference_progress = gr.StatusTracker(cover_container=True)
|
134 |
outcome = gr.outputs.Textbox()
|
135 |
inference_run.click(
|
136 |
analyze_datasets,
|
137 |
+
inputs=[dataset, dataset_name, token, column, pairwise],
|
138 |
outputs=outcome,
|
139 |
status_tracker=inference_progress,
|
140 |
)
|
logs.txt
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Logging training
|
2 |
+
Running DummyClassifier()
|
3 |
+
accuracy: 0.643 average_precision: 0.357 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.392
|
4 |
+
=== new best DummyClassifier() (using recall_macro):
|
5 |
+
accuracy: 0.643 average_precision: 0.357 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.392
|
6 |
+
|
7 |
+
Running GaussianNB()
|
8 |
+
accuracy: 0.623 average_precision: 0.505 roc_auc: 0.590 recall_macro: 0.560 f1_macro: 0.549
|
9 |
+
=== new best GaussianNB() (using recall_macro):
|
10 |
+
accuracy: 0.623 average_precision: 0.505 roc_auc: 0.590 recall_macro: 0.560 f1_macro: 0.549
|
11 |
+
|
12 |
+
Running MultinomialNB()
|
13 |
+
accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588
|
14 |
+
=== new best MultinomialNB() (using recall_macro):
|
15 |
+
accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588
|
16 |
+
|
17 |
+
Running DecisionTreeClassifier(class_weight='balanced', max_depth=1)
|
18 |
+
accuracy: 0.586 average_precision: 0.401 roc_auc: 0.568 recall_macro: 0.568 f1_macro: 0.558
|
19 |
+
Running DecisionTreeClassifier(class_weight='balanced', max_depth=5)
|
20 |
+
accuracy: 0.590 average_precision: 0.419 roc_auc: 0.564 recall_macro: 0.576 f1_macro: 0.560
|
21 |
+
Running DecisionTreeClassifier(class_weight='balanced', min_impurity_decrease=0.01)
|
22 |
+
accuracy: 0.582 average_precision: 0.393 roc_auc: 0.563 recall_macro: 0.567 f1_macro: 0.555
|
23 |
+
Running LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000)
|
24 |
+
accuracy: 0.574 average_precision: 0.487 roc_auc: 0.425 recall_macro: 0.548 f1_macro: 0.547
|
25 |
+
Running LogisticRegression(class_weight='balanced', max_iter=1000)
|
26 |
+
accuracy: 0.578 average_precision: 0.470 roc_auc: 0.437 recall_macro: 0.562 f1_macro: 0.557
|
27 |
+
|
28 |
+
Best model:
|
29 |
+
Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('multinomialnb', MultinomialNB())])
|
30 |
+
Best Scores:
|
31 |
+
accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588
|