mervenoyan committed on
Commit
ae1692d
·
1 Parent(s): 9541eae

misc improvements

Browse files
Files changed (2) hide show
  1. app.py +37 -21
  2. logs.txt +31 -0
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from huggingface_hub.hf_api import create_repo, upload_folder, upload_file
4
  from huggingface_hub.repository import Repository
5
  import subprocess
6
  import os
@@ -12,8 +12,9 @@ import dabl
12
  import re
13
 
14
 
15
- def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwise="off"):
16
  df = pd.read_csv(dataset.name)
 
17
  if column is not None:
18
  analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
19
  else:
@@ -21,7 +22,7 @@ def analyze_datasets(dataset, dataset_name, username, token, column=None, pairwi
21
  analyze_report.show_html('index.html', open_browser=False)
22
  repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
23
 
24
- upload_file(path_or_fileobj ="./index.html", path_in_repo = "index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
25
  readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
26
  with open("README.md", "w+") as f:
27
  f.write(readme)
@@ -40,30 +41,47 @@ def extract_estimator_config(model):
40
  table += f"| {hyperparameter} | {value} |\n"
41
  return table
42
 
43
-
44
- def train_baseline(dataset, username, dataset_name, token, column):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  df = pd.read_csv(dataset.name)
46
- fc = dabl.SimpleClassifier(random_state=0)
47
- df_clean = dabl.clean(df)
48
  X = df_clean.drop(column, axis = 1)
49
  y = df_clean[column]
 
50
  with tempfile.TemporaryDirectory() as tmpdirname:
51
  from contextlib import redirect_stdout
52
 
53
- with open('logs.txt', 'w') as f:
54
- with redirect_stdout(f):
55
- print('Logging training')
56
- fc.fit(X, y)
57
  repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
58
 
59
  readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
60
  readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
61
- readme+="Metrics of the best model:\n\n"
62
  for elem in str(fc.current_best_).split("\n"):
63
  readme+= f"{elem}\n\n"
64
- readme+= "\n\nSee model plot below:\n\n"
65
  readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
66
-
67
  with open(f"{tmpdirname}/README.md", "w+") as f:
68
  f.write(readme)
69
  with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
@@ -76,7 +94,7 @@ def train_baseline(dataset, username, dataset_name, token, column):
76
 
77
  with gr.Blocks() as demo:
78
  main_title = gr.Markdown("""# Baseline Trainer 🪄🌟✨""")
79
- main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card.""")
80
 
81
 
82
  with gr.Tabs():
@@ -87,17 +105,16 @@ with gr.Blocks() as demo:
87
  description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
88
  dataset = gr.File(label = "Dataset")
89
  column = gr.Text(label = "Enter target variable:")
 
90
  dataset_name = gr.Text(label = "Enter dataset name:")
91
- pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
92
  token = gr.Textbox(label = "Your Hugging Face Token")
93
- username = gr.Textbox(label = "Your Hugging Face User Name")
94
  inference_run = gr.Button("Train")
95
  inference_progress = gr.StatusTracker(cover_container=True)
96
 
97
  outcome = gr.outputs.Textbox(label = "Progress")
98
  inference_run.click(
99
  train_baseline,
100
- inputs=[dataset, username, dataset_name, token, column],
101
  outputs=outcome,
102
  status_tracker=inference_progress,
103
  )
@@ -110,15 +127,14 @@ with gr.Blocks() as demo:
110
  column = gr.Text(label = "Compare dataset against a target variable (Optional)")
111
  pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
112
  token = gr.Textbox(label = "Your Hugging Face Token")
113
- username = gr.Textbox(label = "Your Hugging Face User Name")
114
  dataset_name = gr.Textbox(label = "Dataset Name")
115
- pushing_desc = gr.Markdown("This app needs your Hugging Face Hub user name, token and a unique name for your dataset report.")
116
  inference_run = gr.Button("Infer")
117
  inference_progress = gr.StatusTracker(cover_container=True)
118
  outcome = gr.outputs.Textbox()
119
  inference_run.click(
120
  analyze_datasets,
121
- inputs=[dataset, dataset_name, username, token, column, pairwise],
122
  outputs=outcome,
123
  status_tracker=inference_progress,
124
  )
 
1
  import gradio as gr
2
  import pandas as pd
3
+ from huggingface_hub.hf_api import create_repo, upload_folder, upload_file, HfApi
4
  from huggingface_hub.repository import Repository
5
  import subprocess
6
  import os
 
12
  import re
13
 
14
 
15
+ def analyze_datasets(dataset, dataset_name, token, column=None, pairwise="off"):
16
  df = pd.read_csv(dataset.name)
17
+ username = HfApi().whoami(token=token)["name"]
18
  if column is not None:
19
  analyze_report = sv.analyze(df, target_feat=column, pairwise_analysis=pairwise)
20
  else:
 
22
  analyze_report.show_html('index.html', open_browser=False)
23
  repo_url = create_repo(f"{username}/{dataset_name}", repo_type = "space", token = token, space_sdk = "static", private=False)
24
 
25
+ upload_file(path_or_fileobj ="./index.html", path_in_repo = "./index.html", repo_id =f"{username}/{dataset_name}", repo_type = "space", token=token)
26
  readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
27
  with open("README.md", "w+") as f:
28
  f.write(readme)
 
41
  table += f"| {hyperparameter} | {value} |\n"
42
  return table
43
 
44
+ def detect_training(df, column):
45
+ if dabl.detect_types(df)["continuous"][column] or dabl.detect_types(df)["dirty_float"][column]:
46
+ trainer = dabl.SimpleRegressor()
47
+ elif dabl.detect_types(df)["categorical"][column] or dabl.detect_types(df)["low_card_int"][column] or dabl.detect_types(df)["free_string"][column]:
48
+ trainer = dabl.SimpleClassifier()
49
+ return trainer
50
+
51
+ def edit_types(df):
52
+ types = dabl.detect_types(df)
53
+ low_cardinality = types[types["low_card_int"] == True].index.tolist()
54
+ dirty_float = types[types["dirty_float"] == True].index.tolist()
55
+ type_hints = {}
56
+ for col in low_cardinality:
57
+ type_hints[col] = "categorical"
58
+ for col in dirty_float:
59
+ type_hints[col] = "continuous"
60
+ df_clean = dabl.clean(df, type_hints=type_hints)
61
+ return df_clean
62
+
63
+ def train_baseline(dataset, dataset_name, token, column):
64
  df = pd.read_csv(dataset.name)
65
+ df_clean = edit_types(df)
66
+ fc = detect_training(df_clean, column)
67
  X = df_clean.drop(column, axis = 1)
68
  y = df_clean[column]
69
+
70
  with tempfile.TemporaryDirectory() as tmpdirname:
71
  from contextlib import redirect_stdout
72
 
73
+ fc.fit(X, y)
74
+ username = HfApi().whoami(token=token)["name"]
 
 
75
  repo_url = create_repo(repo_id = f"{username}/{dataset_name}", token = token)
76
 
77
  readme = f"---\nlicense: apache-2.0\nlibrary_name: sklearn\n---\n\n"
78
  readme += f"## Baseline Model trained on {dataset_name} to predict {column}\n\n"
79
+ readme+="**Metrics of the best model:**\n\n"
80
  for elem in str(fc.current_best_).split("\n"):
81
  readme+= f"{elem}\n\n"
82
+ readme+= "\n\n**See model plot below:**\n\n"
83
  readme+= re.sub(r"\n\s+", "", str(estimator_html_repr(fc.est_)))
84
+ readme+= "\n\nThis model is trained with dabl library as a baseline, for better results, use AutoTrain.\n\n"
85
  with open(f"{tmpdirname}/README.md", "w+") as f:
86
  f.write(readme)
87
  with open(f"{tmpdirname}/clf.pkl", mode="bw") as f:
 
94
 
95
  with gr.Blocks() as demo:
96
  main_title = gr.Markdown("""# Baseline Trainer 🪄🌟✨""")
97
+ main_desc = gr.Markdown("""This app trains a baseline model for a given dataset and pushes it to your Hugging Face Hub Profile with a model card. For better results, use AutoTrain.""")
98
 
99
 
100
  with gr.Tabs():
 
105
  description = gr.Markdown("This app trains a model and pushes it to your Hugging Face Hub Profile.")
106
  dataset = gr.File(label = "Dataset")
107
  column = gr.Text(label = "Enter target variable:")
108
+ pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique name for your dataset report.")
109
  dataset_name = gr.Text(label = "Enter dataset name:")
 
110
  token = gr.Textbox(label = "Your Hugging Face Token")
 
111
  inference_run = gr.Button("Train")
112
  inference_progress = gr.StatusTracker(cover_container=True)
113
 
114
  outcome = gr.outputs.Textbox(label = "Progress")
115
  inference_run.click(
116
  train_baseline,
117
+ inputs=[dataset, dataset_name, token, column],
118
  outputs=outcome,
119
  status_tracker=inference_progress,
120
  )
 
127
  column = gr.Text(label = "Compare dataset against a target variable (Optional)")
128
  pairwise = gr.Radio(["off", "on"], label = "Enable pairwise analysis")
129
  token = gr.Textbox(label = "Your Hugging Face Token")
 
130
  dataset_name = gr.Textbox(label = "Dataset Name")
131
+ pushing_desc = gr.Markdown("This app needs your Hugging Face Hub token and a unique repository name for your dataset report.")
132
  inference_run = gr.Button("Infer")
133
  inference_progress = gr.StatusTracker(cover_container=True)
134
  outcome = gr.outputs.Textbox()
135
  inference_run.click(
136
  analyze_datasets,
137
+ inputs=[dataset, dataset_name, token, column, pairwise],
138
  outputs=outcome,
139
  status_tracker=inference_progress,
140
  )
logs.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Logging training
2
+ Running DummyClassifier()
3
+ accuracy: 0.643 average_precision: 0.357 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.392
4
+ === new best DummyClassifier() (using recall_macro):
5
+ accuracy: 0.643 average_precision: 0.357 roc_auc: 0.500 recall_macro: 0.500 f1_macro: 0.392
6
+
7
+ Running GaussianNB()
8
+ accuracy: 0.623 average_precision: 0.505 roc_auc: 0.590 recall_macro: 0.560 f1_macro: 0.549
9
+ === new best GaussianNB() (using recall_macro):
10
+ accuracy: 0.623 average_precision: 0.505 roc_auc: 0.590 recall_macro: 0.560 f1_macro: 0.549
11
+
12
+ Running MultinomialNB()
13
+ accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588
14
+ === new best MultinomialNB() (using recall_macro):
15
+ accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588
16
+
17
+ Running DecisionTreeClassifier(class_weight='balanced', max_depth=1)
18
+ accuracy: 0.586 average_precision: 0.401 roc_auc: 0.568 recall_macro: 0.568 f1_macro: 0.558
19
+ Running DecisionTreeClassifier(class_weight='balanced', max_depth=5)
20
+ accuracy: 0.590 average_precision: 0.419 roc_auc: 0.564 recall_macro: 0.576 f1_macro: 0.560
21
+ Running DecisionTreeClassifier(class_weight='balanced', min_impurity_decrease=0.01)
22
+ accuracy: 0.582 average_precision: 0.393 roc_auc: 0.563 recall_macro: 0.567 f1_macro: 0.555
23
+ Running LogisticRegression(C=0.1, class_weight='balanced', max_iter=1000)
24
+ accuracy: 0.574 average_precision: 0.487 roc_auc: 0.425 recall_macro: 0.548 f1_macro: 0.547
25
+ Running LogisticRegression(class_weight='balanced', max_iter=1000)
26
+ accuracy: 0.578 average_precision: 0.470 roc_auc: 0.437 recall_macro: 0.562 f1_macro: 0.557
27
+
28
+ Best model:
29
+ Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('multinomialnb', MultinomialNB())])
30
+ Best Scores:
31
+ accuracy: 0.647 average_precision: 0.481 roc_auc: 0.609 recall_macro: 0.589 f1_macro: 0.588