## Data Deployment Draft

In [1]:
# import libraries
import pandas as pd

In [2]:
# Read the preprocessed dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='data/processed_data.csv')

# Display the first five rows as a quick sanity check
data.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,39,0,3,0.318958,2,3,1,5,2,1,...,3,2,1,7,3,2,16,11,0,9
1,29,0,2,0.84593,1,21,3,3,2,1,...,3,3,1,8,3,2,2,5,3,5
2,40,0,2,0.153782,2,1,1,5,2,1,...,3,1,1,24,2,2,0,0,0,0
3,24,0,2,0.785534,1,6,3,1,4,2,...,3,3,1,4,0,2,7,8,0,6
4,44,0,2,1.0,1,5,5,5,1,1,...,3,4,1,0,2,3,0,5,1,2


In [3]:
# Build the sample file
# Draw a reproducible 10% random subset of the rows (fixed seed) and strip the
# Attrition column, which the training cell uses as the target.
sample_data = (
    data
    .sample(frac=0.1, random_state=1)
    .drop(columns=['Attrition'])
)

# Write the subset to disk; index=False keeps the row index out of the CSV
sample_data.to_csv('data/sample_data.csv', index=False)

In [100]:
# train a xgboost model
from xgboost import XGBClassifier

# the target is the Attrition column; every other column is used as a feature
y = data['Attrition']
x = data.drop(columns=['Attrition'])

# parameter tuning (kept for reference, not executed)
# The search is fitted on x / y because no x_train / y_train split is defined
# in the visible cells of this notebook.
# from sklearn.model_selection import GridSearchCV
#
# # more in depth search
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [2, 3, 4],
#     'learning_rate': [0.1, 0.01, 0.001]
# }
#
# grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=10, n_jobs=-1)
# grid_search.fit(x, y)
#
# # best parameters
# print(grid_search.best_params_)

# Build the classifier once, directly with the chosen hyperparameters (the
# earlier default-constructed instance was overwritten before being used).
# NOTE(review): n_estimators=250 is not one of the grid values listed above,
# so it was presumably picked by hand -- confirm where it comes from.
model = XGBClassifier(n_estimators=250, max_depth=4)

# fit the model on the full dataset; as the last expression, the fitted
# estimator is also displayed as the cell output
model.fit(x, y)

In [101]:
# test the model
# NOTE: the training cell fits `model` on this same x / y, so the two figures
# printed first are training-set (resubstitution) metrics. They show how well
# the model memorised the data, not how well it generalises.
y_pred = model.predict(x)

# evaluate the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y, y_pred)
print(f'Accuracy: {accuracy}')

# confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y, y_pred)
print(f'Confusion matrix: \n{conf_matrix}')

# Out-of-fold estimate: every row is predicted by a model that never saw it
# during fitting. cross_val_predict fits clones of the estimator, so the
# already-fitted `model` (the one that gets saved) is left untouched.
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Stratified folds keep the class ratio per fold; the fixed seed makes the
# split reproducible across re-runs.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
y_pred_cv = cross_val_predict(model, x, y, cv=cv_folds)

cv_accuracy = accuracy_score(y, y_pred_cv)
print(f'Cross-validated accuracy: {cv_accuracy}')

cv_conf_matrix = confusion_matrix(y, y_pred_cv)
print(f'Cross-validated confusion matrix: \n{cv_conf_matrix}')

Accuracy: 1.0
Confusion matrix: 
[[507   0]
 [  0 493]]


In [102]:
# save the model as pkl
from pathlib import Path

import joblib

# Create the output directory first so the dump does not raise
# FileNotFoundError when 'model/' does not exist yet (e.g. a fresh checkout).
Path('model').mkdir(parents=True, exist_ok=True)

# joblib.dump returns the list of file names it wrote, which the notebook
# shows as the cell output.
joblib.dump(model, 'model/model.pkl')

['model/model.pkl']