prthgo commited on
Commit
2ab6e66
·
1 Parent(s): 9c48475

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +1046 -0
app.py ADDED
@@ -0,0 +1,1046 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import io
5
+ import matplotlib.pyplot as plt
6
+ from matplotlib.ticker import PercentFormatter
7
+ import seaborn as sns
8
+ from sklearn.preprocessing import (
9
+ OneHotEncoder,
10
+ OrdinalEncoder,
11
+ StandardScaler,
12
+ MinMaxScaler,
13
+ )
14
+ from sklearn.model_selection import train_test_split
15
+ from imblearn.under_sampling import RandomUnderSampler
16
+ from imblearn.over_sampling import RandomOverSampler, SMOTE
17
+ from sklearn.linear_model import Ridge, Lasso, LogisticRegression
18
+ from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
19
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
20
+ from sklearn.svm import SVR, SVC
21
+ from sklearn.naive_bayes import MultinomialNB
22
+ from xgboost import XGBRFRegressor, XGBRFClassifier
23
+ from lightgbm import LGBMRegressor, LGBMClassifier
24
+ from sklearn.metrics import (
25
+ mean_absolute_error,
26
+ mean_squared_error,
27
+ mean_squared_error,
28
+ r2_score,
29
+ )
30
+ from sklearn.metrics import (
31
+ accuracy_score,
32
+ f1_score,
33
+ confusion_matrix,
34
+ precision_score,
35
+ recall_score,
36
+ )
37
+ import pickle
38
+
39
# Streamlit page chrome and a global seaborn theme applied to every chart.
st.set_page_config(page_title="Tabular Data Analysis and Auto ML", page_icon="🤖")
sns.set_style("white")
sns.set_context("poster", font_scale=0.7)
# Shared color palette reused by all plotting helpers below.
palette = [
    "#1d7874",
    "#679289",
    "#f4c095",
    "#ee2e31",
    "#ffb563",
    "#918450",
    "#f85e00",
    "#a41623",
    "#9a031e",
    "#d6d6d6",
    "#ffee32",
    "#ffd100",
    "#333533",
    "#202020",
]
58
+
59
+
60
def main():
    """Streamlit entry point.

    Renders the sidebar (file upload + page selector), caches the uploaded
    CSV in ``st.session_state["data"]`` so it survives Streamlit reruns,
    and dispatches to the page selected in the sidebar.
    """
    file = st.sidebar.file_uploader("Upload Your CSV File Here: ")
    process = st.sidebar.button("Process")
    option = st.sidebar.radio(
        "Select an Option: ",
        (
            "Basic EDA",
            "Univariate Analysis",
            "Bivariate Analysis",
            "Preprocess",
            "Training and Evaluation",
        ),
    )
    placeholder = st.empty()
    placeholder.markdown(
        "<h1 style='text-align: center;'>Welcome to Tabular Data Analysis and Auto ML🤖</h1>",
        unsafe_allow_html=True,
    )

    if file is not None and process:
        st.session_state["data"] = load_csv(file)

    if "data" not in st.session_state:
        return  # Nothing uploaded yet; keep showing the welcome banner.

    data = st.session_state["data"]
    placeholder.empty()

    pages = {
        "Basic EDA": _basic_eda_page,
        "Univariate Analysis": _univariate_page,
        "Bivariate Analysis": _bivariate_page,
        "Preprocess": _preprocess_page,
        "Training and Evaluation": _training_page,
    }
    pages[option](data)


def _basic_eda_page(data):
    """Overview page: shape, duplicates, dtypes, missing data, statistics."""
    st.markdown(
        "<h1 style='text-align: center;'>Basic EDA</h1>", unsafe_allow_html=True
    )

    st.subheader("Data Overview")
    st.write(data_overview(data))
    st.write(duplicate(data))
    st.dataframe(data.head())

    st.subheader("Data Types and Unique Value Counts")
    display_data_info(data)

    st.subheader("Missing Data")
    missing_data(data)

    st.subheader("Value Counts")
    value_counts(data)

    st.subheader("Descriptive Statistics")
    st.write(data.describe().T)


def _univariate_page(data):
    """Single-column charts; selectable columns depend on the chart type."""
    st.markdown(
        "<h1 style='text-align: center;'>Univariate Analysis</h1>",
        unsafe_allow_html=True,
    )
    plot = st.radio(
        "Select a chart: ",
        ("Count Plot", "Pie Chart", "Histogram", "Violin Plot", "Scatter Plot"),
    )

    # Categorical charts list object columns; numeric charts list int/float ones.
    numeric_cols = list(data.select_dtypes(include=["int", "float"]))
    charts = {
        "Count Plot": (countplot, list(data.select_dtypes("O"))),
        "Pie Chart": (piechart, list(data.select_dtypes("O"))),
        "Histogram": (histogram, numeric_cols),
        "Violin Plot": (violinplot, numeric_cols),
        "Scatter Plot": (scatterplot, numeric_cols),
    }
    plot_fn, columns = charts[plot]
    column = st.selectbox("Select a column", [""] + columns)
    if column:
        plot_fn(data, column)


def _bivariate_page(data):
    """Two-column charts plus a Pareto chart for a single categorical column."""
    st.markdown(
        "<h1 style='text-align: center;'>Bivariate Analysis</h1>",
        unsafe_allow_html=True,
    )
    plot = st.radio(
        "Select a chart: ",
        ("Scatter Plot", "Bar Plot", "Box Plot", "Pareto Chart"),
    )

    if plot == "Scatter Plot":
        columns = st.multiselect(
            "Select two columns",
            [""] + list(data.select_dtypes(include=["int", "float"])),
        )
        if columns:
            biscatterplot(data, columns)

    elif plot == "Bar Plot":
        columns = st.multiselect("Select two columns", list(data.columns))
        if columns:
            bibarplot(data, columns)

    elif plot == "Box Plot":
        columns = st.multiselect("Select two columns", list(data.columns))
        if columns:
            biboxplot(data, columns)

    elif plot == "Pareto Chart":
        column = st.selectbox(
            "Select a columns",
            [""] + list(data.select_dtypes(include="object")),
        )
        if column:
            paretoplot(data, column)


def _preprocess_page(data):
    """Column dropping, missing-value imputation and categorical encoding.

    The (possibly mutated) frame is written back to session state and
    offered as a CSV download.
    """
    st.markdown(
        "<h1 style='text-align: center;'>Data Preprocessing</h1>",
        unsafe_allow_html=True,
    )

    operation = st.radio(
        "Select preprocessing step: ",
        (
            "Drop Columns",
            "Handling Missing Values",
            "Encode Categorical Features",
        ),
    )

    if operation == "Drop Columns":
        columns = st.multiselect("Select Columns to drop: ", (data.columns))
        if st.button("Drop Columns"):
            data.drop(columns, axis=1, inplace=True)
            st.success("Dropped selected columns✅✅✅")

    elif operation == "Handling Missing Values":
        data = _handle_missing_values(data)

    elif operation == "Encode Categorical Features":
        data = _encode_categoricals(data)

    # Store booleans (e.g. produced by get_dummies) as 0/1 integers.
    bool_columns = data.select_dtypes(include=bool).columns
    data[bool_columns] = data[bool_columns].astype(int)
    st.session_state["data"] = data

    if st.download_button(
        label="Download Preprocessed Data",
        key="preprocessed_data",
        data=data.to_csv(index=False).encode(),
        file_name="preprocessed_data.csv",
        mime="text/csv",
    ):
        st.success("Data Downloaded")


def _handle_missing_values(data):
    """Impute or drop missing values; numeric and categorical handled separately."""
    num_missing = st.selectbox(
        "Select a Approach (Numerical columns only): ",
        ("", "Drop", "Backward Fill", "Forward Fill", "Mean", "Median"),
    ).lower()
    cat_missing = st.selectbox(
        "Select a Approach (Categorical columns only): ",
        ("", "Drop", "Most Frequent Values", "Replace with 'Unknown'"),
    ).lower()

    if not st.button("Handle Missing Values"):
        return data

    if num_missing:
        num_data = data.select_dtypes(include=["int64", "float64"])
        if num_missing == "drop":
            data = data.dropna(subset=num_data.columns)
        else:
            # Bound methods; called lazily only for the chosen approach.
            fillers = {
                "mean": num_data.mean,
                "median": num_data.median,
                "backward fill": num_data.bfill,
                "forward fill": num_data.ffill,
            }
            if num_missing in fillers:
                data.fillna(value=fillers[num_missing](), inplace=True)
        st.success(
            "Imputed missing values in numerical columns with selected approach."
        )

    if cat_missing:
        cat_data = data.select_dtypes(exclude=["int", "float"])
        if cat_missing == "drop":
            data = data.dropna(subset=cat_data.columns)
        elif cat_missing == "most frequent values":
            mode_values = data[cat_data.columns].mode().iloc[0]
            data[cat_data.columns] = data[cat_data.columns].fillna(mode_values)
        elif cat_missing == "replace with 'unknown'":
            data[cat_data.columns] = data[cat_data.columns].fillna("Unknown")
        st.success(
            "Imputed missing values in categorical columns with selected approach."
        )

    return data


def _encode_categoricals(data):
    """Ordinal-encode the chosen columns; one-hot encode the remaining ones."""
    oe_columns = st.multiselect(
        "Choose Columns for Ordinal Encoding",
        [""] + list(data.select_dtypes(include="object")),
    )
    st.info("Other columns will be One Hot Encoded.")

    if not st.button("Encode Columns"):
        return data

    bool_columns = data.select_dtypes(include=bool).columns
    data[bool_columns] = data[bool_columns].astype(int)
    if oe_columns:
        encoder = OrdinalEncoder()
        data[oe_columns] = encoder.fit_transform(data[oe_columns].astype("str"))

    # BUG FIX: the original wrapped this comprehension in a bare
    # `except: pass`, which could leave `remaining_cat_cols` undefined and
    # raise NameError below; the comprehension itself cannot fail.
    remaining_cat_cols = [
        col for col in data.select_dtypes(include="object") if col not in oe_columns
    ]
    if remaining_cat_cols:
        data = pd.get_dummies(data, columns=remaining_cat_cols, drop_first=False)
    st.success("Encoded categorical columns")
    return data


def _training_page(data):
    """Train/test split, optional balancing & scaling, model fit and metrics."""
    st.markdown(
        "<h1 style='text-align: center;'>Training and Evaluation</h1>",
        unsafe_allow_html=True,
    )
    algo = st.selectbox(
        "Choose Algorithm Type:", ("", "Regression", "Classification")
    )
    if not algo:
        return

    target = st.selectbox("Chose Target Variable (Y): ", list(data.columns))
    try:
        X = data.drop(target, axis=1)
        Y = data[target]
    except Exception as e:
        st.write(str(e))
        return

    st.write(
        "80% of the data will be used for training the model, rest of 20% data will be used for evaluating the model."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )

    if algo == "Classification":
        X_train, y_train = _maybe_balance(data, target, X_train, y_train)

    X_train, X_test = _maybe_scale(X_train, X_test)

    if algo == "Regression":
        _fit_regressor(X_train, X_test, y_train, y_test)
    else:
        _fit_classifier(X_train, X_test, y_train, y_test)


def _maybe_balance(data, target, X_train, y_train):
    """Optionally resample the training split to balance the target classes."""
    balance = st.selectbox("Do you want to balance dataset?", ("", "Yes", "No"))
    if balance != "Yes":
        return X_train, y_train

    piechart(data, target)
    sample = st.selectbox(
        "Which approach you want to use?",
        ("", "Random Under Sampling", "Random Over Sampling", "SMOTE"),
    )
    samplers = {
        "Random Under Sampling": RandomUnderSampler,
        "Random Over Sampling": RandomOverSampler,
        "SMOTE": SMOTE,
    }
    if sample in samplers:
        X_train, y_train = samplers[sample](random_state=42).fit_resample(
            X_train, y_train
        )
    return X_train, y_train


def _maybe_scale(X_train, X_test):
    """Optionally scale features; fit on the training split only (no leakage)."""
    scale = st.selectbox(
        "Choose how do you want to scale features:",
        ("", "Standard Scaler", "Min Max Scaler"),
    )
    scalers = {"Standard Scaler": StandardScaler, "Min Max Scaler": MinMaxScaler}
    if scale in scalers:
        scaler = scalers[scale]()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    return X_train, X_test


# Display name -> (model factory, pickle file name). Factories are lambdas so
# a fresh estimator is built on every Streamlit rerun.
_REGRESSORS = {
    "Ridge Regression": (lambda: Ridge(alpha=1.0), "ridge_regression_model.pkl"),
    "Decision Tree Regressor": (
        lambda: DecisionTreeRegressor(max_depth=10),
        "decision_tree_regression_model.pkl",
    ),
    "Random Forest Regressor": (
        lambda: RandomForestRegressor(max_depth=10, n_estimators=100),
        "random_forest_regression_model.pkl",
    ),
    "SVR": (lambda: SVR(C=1.0, epsilon=0.2), "svr_model.pkl"),
    "XGBRF Regressor": (
        lambda: XGBRFRegressor(reg_lambda=1),
        "xgbrf_regression_model.pkl",
    ),
    "LGBM Regressor": (
        lambda: LGBMRegressor(reg_lambda=1),
        "lgbm_regression_model.pkl",
    ),
}


def _fit_regressor(X_train, X_test, y_train, y_test):
    """Fit the chosen regressor and report MAE/MSE/RMSE/R² on the test split."""
    choice = st.selectbox(
        "Choose Regression Model for training: ", ("",) + tuple(_REGRESSORS)
    )
    if choice not in _REGRESSORS:
        return
    factory, file_name = _REGRESSORS[choice]
    reg = factory()
    reg.fit(X_train, y_train)
    pred = reg.predict(X_test)

    # BUG FIX: the original passed (pred, y_test) to every metric; sklearn
    # expects y_true first — MAE/MSE are symmetric but R² is not.
    st.write(
        "Mean Absolute Error (MAE): {:.4f}".format(mean_absolute_error(y_test, pred))
    )
    mse = mean_squared_error(y_test, pred)
    st.write("Mean Squared Error (MSE): {:.4f}".format(mse))
    # sqrt(MSE) avoids the deprecated `squared=False` keyword.
    st.write("Root Mean Squared Error (RMSE): {:.4f}".format(mse**0.5))
    st.write("R-squared (R²): {:.4f}".format(r2_score(y_test, pred)))

    _offer_model_download(reg, file_name)


# Display name -> (model factory, pickle file name, confusion-matrix title).
_CLASSIFIERS = {
    "Logistic Regression": (
        lambda: LogisticRegression(penalty="l2"),
        "logistic_regression_model.pkl",
        "Logistic Regression Confusion Matrix ",
    ),
    "Decision Tree Classifier": (
        lambda: DecisionTreeClassifier(max_depth=5),
        "decision_tree_classifier_model.pkl",
        "DecisionTree Classifier Confusion Matrix ",
    ),
    "Random Forest Classifier": (
        lambda: RandomForestClassifier(n_estimators=100, max_depth=5),
        "random_forest_classifier_model.pkl",
        "RandomForest Classifier Confusion Matrix ",
    ),
    "SVC": (lambda: SVC(C=1.5), "svc_model.pkl", "SVC Confusion Matrix "),
    "XGBRF Classifier": (
        lambda: XGBRFClassifier(reg_lambda=1.0),
        "xgbrf_classifier_model.pkl",
        "XGBRF Classifier Confusion Matrix ",
    ),
    "LGBM Classifier": (
        lambda: LGBMClassifier(reg_lambda=1.0),
        "lgbm_classifier_model.pkl",
        "LGBM Classifier Confusion Matrix ",
    ),
}


def _fit_classifier(X_train, X_test, y_train, y_test):
    """Fit the chosen classifier; report accuracy, F1, precision, recall and a confusion matrix."""
    choice = st.selectbox(
        "Choose Classification Model for training: ", ("",) + tuple(_CLASSIFIERS)
    )
    if choice not in _CLASSIFIERS:
        return
    factory, file_name, cm_title = _CLASSIFIERS[choice]
    clf = factory()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # BUG FIX: the original passed (pred, y_test); y_true must come first,
    # otherwise precision and recall are swapped with each other.
    st.write("Accuracy Score: {:.4f}".format(accuracy_score(y_test, pred)))
    try:
        # Binary targets accept the default (positive-class) averaging.
        st.write("F1 Score: {:.4f}".format(f1_score(y_test, pred)))
        st.write("Precision Score: {:.4f}".format(precision_score(y_test, pred)))
        st.write("Recall Score: {:.4f}".format(recall_score(y_test, pred)))
    except ValueError:
        # Multiclass targets need an explicit averaging strategy.
        st.write(
            "Macro Precision Score: {:.4f}".format(
                precision_score(y_test, pred, average="macro")
            )
        )
        st.write(
            "Macro Recall Score: {:.4f}".format(
                recall_score(y_test, pred, average="macro")
            )
        )
        st.write(
            "Macro F1 Score: {:.4f}".format(f1_score(y_test, pred, average="macro"))
        )

    # BUG FIX: pass y_true first here as well.
    plot_confusion_matrix(y_test, pred, cm_title)
    _offer_model_download(clf, file_name)


def _offer_model_download(model, file_name):
    """Offer the fitted model for download as a pickle file.

    The bytes are produced directly with ``pickle.dumps``; the original
    also wrote a copy to the server's disk on click, which served no purpose.
    """
    st.download_button(
        label="Download Trained Model",
        key="trained_model",
        data=pickle.dumps(model),
        file_name=file_name,
        mime="application/octet-stream",
    )
844
+
845
+
846
def load_csv(file):
    """Parse an uploaded CSV file-like object into a DataFrame."""
    return pd.read_csv(file)
849
+
850
+
851
def data_overview(data):
    """Show the row count via Streamlit and return a column-count summary string."""
    rows, cols = data.shape
    st.write(f"Number of Rows: {rows}")
    return f"Number of Columns: {cols}"
855
+
856
+
857
def missing_data(data):
    """Render an HTML table of per-column missing counts and percentages.

    Only columns that actually contain missing values are listed.
    """
    counts = data.isna().sum()
    counts = counts[counts > 0]
    percentages = ((counts / data.shape[0]) * 100).round(2).astype(str) + "%"
    table = pd.DataFrame({"Missing Values": counts, "Percentage": percentages})
    html = table.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
869
+
870
+
871
def display_data_info(data):
    """Render a table listing each column's dtype and unique-value count."""
    dtype_df = pd.DataFrame(data.dtypes, columns=["Data Type"]).reset_index()
    dtype_df.columns = ["Column", "Data Type"]
    unique_df = pd.DataFrame(data.nunique(), columns=["Unique Counts"]).reset_index()
    unique_df.columns = ["Column", "Unique Counts"]
    merged = pd.merge(dtype_df, unique_df, on="Column")
    html = merged.to_html(classes="table table-striped", justify="center")
    return st.markdown(html, unsafe_allow_html=True)
883
+
884
+
885
def value_counts(data):
    """Let the user pick a column and show its value counts."""
    chosen = st.selectbox("Select a Column", [""] + list(data.columns))
    if not chosen:
        return
    st.write(data[chosen].value_counts())
889
+
890
+
891
def duplicate(data):
    """Drop duplicated rows in place and return a status message.

    Returns an empty string after reporting/dropping duplicates, or a
    "no duplicates" message when the frame is already unique.
    """
    dup_count = data.duplicated().sum()
    if not dup_count:
        return "There are no duplicate rows in the DataFrame."
    st.write(
        f"There is/are {dup_count} duplicate rows in the DataFrame. Duplicated values will be dropped."
    )
    data.drop_duplicates(keep="first", inplace=True)
    return ""
901
+
902
def countplot(data, col):
    """Render a horizontal count plot of value frequencies in *col*."""
    plt.figure(figsize=(10, 6))
    ax = sns.countplot(
        y=data[col],
        palette=palette[1:],
        edgecolor="#1c1c1c",
        linewidth=2,
    )
    ax.set_title(f"Countplot of {col} Column")
    st.pyplot(plt)
907
+
908
+
909
def piechart(data, col):
    """Render a pie chart of category shares in *col*."""
    # Local name avoids shadowing the module-level value_counts helper.
    counts = data[col].value_counts()
    plt.figure(figsize=(8, 6))
    plt.pie(
        counts,
        labels=counts.index,
        autopct="%1.1f%%",
        colors=palette,
        shadow=False,
        wedgeprops=dict(edgecolor="#1c1c1c"),
    )
    plt.title(f"Pie Chart of {col} Column")
    st.pyplot(plt)
922
+
923
+
924
def histogram(data, col):
    """Render a histogram with a KDE overlay for a numeric column."""
    plt.figure(figsize=(10, 6))
    ax = sns.histplot(
        data[col],
        kde=True,
        color=palette[4],
        fill=True,
        edgecolor="#1c1c1c",
        linewidth=2,
    )
    ax.set_title(f"Histogram of {col} Column")
    st.pyplot(plt)
936
+
937
+
938
def violinplot(data, col):
    """Render a violin plot showing the distribution of a numeric column."""
    plt.figure(figsize=(10, 6))
    ax = sns.violinplot(data[col], color=palette[8])
    ax.set_title(f"Violin Plot of {col} Column")
    st.pyplot(plt)
943
+
944
+
945
def scatterplot(data, col):
    """Render a scatter plot of a single numeric column against its index."""
    plt.figure(figsize=(10, 8))
    ax = sns.scatterplot(data[col], color=palette[3])
    ax.set_title(f"Scatter Plot of {col} Column")
    st.pyplot(plt)
950
+
951
+
952
def biscatterplot(data, cols):
    """Render a scatter plot of the first two selected columns.

    Any failure (e.g. fewer than two columns chosen) is reported to the UI.
    """
    try:
        plt.figure(figsize=(10, 8))
        x_col, y_col = cols[0], cols[1]
        sns.scatterplot(
            data=data,
            x=x_col,
            y=y_col,
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        plt.title(f"Scatter Plot of {x_col} and {y_col} Columns")
        st.pyplot(plt)
    except Exception as err:
        st.write(str(err))
967
+
968
+
969
def bibarplot(data, cols):
    """Render a bar plot of the first two selected columns.

    Any failure (e.g. fewer than two columns chosen) is reported to the UI.
    """
    try:
        plt.figure(figsize=(10, 8))
        x_col, y_col = cols[0], cols[1]
        sns.barplot(
            data=data,
            x=x_col,
            y=y_col,
            palette=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        plt.title(f"Bar Plot of {x_col} and {y_col} Columns")
        st.pyplot(plt)
    except Exception as err:
        st.write(str(err))
984
+
985
+
986
def biboxplot(data, cols):
    """Render a box plot of the first two selected columns.

    Any failure (e.g. fewer than two columns chosen) is reported to the UI.
    """
    try:
        plt.figure(figsize=(10, 8))
        x_col, y_col = cols[0], cols[1]
        sns.boxplot(data=data, x=x_col, y=y_col, palette=palette[1:], linewidth=2)
        plt.title(f"Box Plot of {x_col} and {y_col} Columns")
        st.pyplot(plt)
    except Exception as err:
        st.write(str(err))
994
+
995
+
996
def paretoplot(data, categorical_col):
    """Render a Pareto chart for a categorical column.

    Bars show category frequencies (left axis); a diamond-marked line shows
    the cumulative percentage of the total (right axis).
    """
    try:
        value_counts = data[categorical_col].value_counts()
        cumulative_percentage = (value_counts / value_counts.sum()).cumsum()
        pareto_df = pd.DataFrame(
            {
                "Categories": value_counts.index,
                "Frequency": value_counts.values,
                "Cumulative Percentage": cumulative_percentage.values * 100,
            }
        )
        pareto_df = pareto_df.sort_values(by="Frequency", ascending=False)

        fig, ax1 = plt.subplots(figsize=(10, 8))
        ax1.bar(
            pareto_df["Categories"],
            pareto_df["Frequency"],
            color=palette[1:],
            edgecolor="#1c1c1c",
            linewidth=2,
        )
        # Secondary y-axis carries the cumulative-percentage line.
        ax2 = ax1.twinx()
        ax2.yaxis.set_major_formatter(PercentFormatter())
        ax2.plot(
            pareto_df["Categories"],
            pareto_df["Cumulative Percentage"],
            color=palette[3],
            marker="D",
            ms=10,
        )
        ax1.set_xlabel(categorical_col)
        ax1.set_ylabel("Frequency", color=palette[0])
        ax2.set_ylabel("Cumulative Percentage", color=palette[3])
        st.pyplot(fig)

    except Exception as e:
        # BUG FIX: errors were silently swallowed (`pass`); report them to
        # the UI like the other bivariate plot helpers do.
        st.write(str(e))
1033
+
1034
+
1035
def plot_confusion_matrix(y_true, y_pred, title):
    """Render the confusion matrix for *y_true* vs *y_pred* as a heatmap."""
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(title)
    st.pyplot(plt)
1043
+
1044
+
1045
# Launch the Streamlit app when executed as a script.
if __name__ == "__main__":
    main()