{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Deployment Draft" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import libraries\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeAttritionBusinessTravelDailyRateDepartmentDistanceFromHomeEducationEducationFieldEnvironmentSatisfactionGender...PerformanceRatingRelationshipSatisfactionStockOptionLevelTotalWorkingYearsTrainingTimesLastYearWorkLifeBalanceYearsAtCompanyYearsInCurrentRoleYearsSinceLastPromotionYearsWithCurrManager
039030.318958231521...321732161109
129020.8459301213321...3318322535
240020.153782211521...31124220000
324020.785534163142...3314027806
444021.000000155511...3410230512
\n", "

5 rows × 30 columns

\n", "
" ], "text/plain": [ " Age Attrition BusinessTravel DailyRate Department DistanceFromHome \\\n", "0 39 0 3 0.318958 2 3 \n", "1 29 0 2 0.845930 1 21 \n", "2 40 0 2 0.153782 2 1 \n", "3 24 0 2 0.785534 1 6 \n", "4 44 0 2 1.000000 1 5 \n", "\n", " Education EducationField EnvironmentSatisfaction Gender ... \\\n", "0 1 5 2 1 ... \n", "1 3 3 2 1 ... \n", "2 1 5 2 1 ... \n", "3 3 1 4 2 ... \n", "4 5 5 1 1 ... \n", "\n", " PerformanceRating RelationshipSatisfaction StockOptionLevel \\\n", "0 3 2 1 \n", "1 3 3 1 \n", "2 3 1 1 \n", "3 3 3 1 \n", "4 3 4 1 \n", "\n", " TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany \\\n", "0 7 3 2 16 \n", "1 8 3 2 2 \n", "2 24 2 2 0 \n", "3 4 0 2 7 \n", "4 0 2 3 0 \n", "\n", " YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager \n", "0 11 0 9 \n", "1 5 3 5 \n", "2 0 0 0 \n", "3 8 0 6 \n", "4 5 1 2 \n", "\n", "[5 rows x 30 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the data\n", "data = pd.read_csv('data/processed_data.csv')\n", "\n", "# preview the data\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# sampling data\n", "# sample 10% of the data and save as sample_data.csv\n", "sample_data = data.sample(frac=0.1, random_state=1)\n", "\n", "# remove the Attrition column\n", "sample_data = sample_data.drop(columns='Attrition')\n", "sample_data.to_csv('data/sample_data.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
<div><pre>XGBClassifier(max_depth=4, n_estimators=250, ...)</pre></div>
" ], "text/plain": [ "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=None, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=4, max_leaves=None,\n", " min_child_weight=None, missing=nan, monotone_constraints=None,\n", " n_estimators=250, n_jobs=None, num_parallel_tree=None,\n", " predictor=None, random_state=None, ...)" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# train a xgboost model\n", "from xgboost import XGBClassifier\n", "\n", "# the target is the Attrition\n", "y = data['Attrition']\n", "x = data.drop(['Attrition'], axis=1)\n", "\n", "# train the model\n", "model = XGBClassifier()\n", "\n", "# parameter tuning\n", "# from sklearn.model_selection import GridSearchCV\n", "\n", "# # more in depth search\n", "# param_grid = {\n", "# 'n_estimators': [100, 200, 300],\n", "# 'max_depth': [2, 3, 4],\n", "# 'learning_rate': [0.1, 0.01, 0.001]\n", "# }\n", "\n", "# grid_search = GridSearchCV(model, param_grid, cv=10, n_jobs=-1)\n", "# grid_search.fit(x_train, y_train)\n", "\n", "# # best parameters\n", "# print(grid_search.best_params_)\n", "\n", "# train the model with the best parameters\n", "model = XGBClassifier(n_estimators=250, max_depth=4)\n", "\n", "# fit the model\n", "model.fit(x, y)" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 1.0\n", "Confusion matrix: \n", "[[507 0]\n", " [ 0 493]]\n" ] } ], "source": [ "# test the model\n", "y_pred = model.predict(x)\n", "\n", "# evaluate the model\n", "from sklearn.metrics import accuracy_score\n", "accuracy = accuracy_score(y, y_pred)\n", "print(f'Accuracy: {accuracy}')\n", "\n", "# confusion matrix\n", "from sklearn.metrics import confusion_matrix\n", "conf_matrix = confusion_matrix(y, y_pred)\n", "print(f'Confusion matrix: \\n{conf_matrix}')" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['model/model.pkl']" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# save the model as pkl\n", "import joblib\n", "joblib.dump(model, 'model/model.pkl')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }