{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d4c303ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.simplefilter(\"ignore\")\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n",
    "import xgboost as xgb\n",
    "from sklearn.preprocessing import LabelEncoder\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "4e15af5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENDER</th>\n",
       "      <th>AGE</th>\n",
       "      <th>SMOKING</th>\n",
       "      <th>YELLOW_FINGERS</th>\n",
       "      <th>ANXIETY</th>\n",
       "      <th>PEER_PRESSURE</th>\n",
       "      <th>CHRONIC DISEASE</th>\n",
       "      <th>FATIGUE</th>\n",
       "      <th>ALLERGY</th>\n",
       "      <th>WHEEZING</th>\n",
       "      <th>ALCOHOL CONSUMING</th>\n",
       "      <th>COUGHING</th>\n",
       "      <th>SHORTNESS OF BREATH</th>\n",
       "      <th>SWALLOWING DIFFICULTY</th>\n",
       "      <th>CHEST PAIN</th>\n",
       "      <th>LUNG_CANCER</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>M</td>\n",
       "      <td>69</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>YES</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>M</td>\n",
       "      <td>74</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>YES</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>F</td>\n",
       "      <td>59</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>NO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>M</td>\n",
       "      <td>63</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>NO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>F</td>\n",
       "      <td>63</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \\\n",
       "0      M   69        1               2        2              1   \n",
       "1      M   74        2               1        1              1   \n",
       "2      F   59        1               1        1              2   \n",
       "3      M   63        2               2        2              1   \n",
       "4      F   63        1               2        1              1   \n",
       "\n",
       "   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \\\n",
       "0                1         2         1         2                  2         2   \n",
       "1                2         2         2         1                  1         1   \n",
       "2                1         2         1         2                  1         2   \n",
       "3                1         1         1         1                  2         1   \n",
       "4                1         1         1         2                  1         2   \n",
       "\n",
       "   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  \n",
       "0                    2                      2           2         YES  \n",
       "1                    2                      2           2         YES  \n",
       "2                    2                      1           2          NO  \n",
       "3                    1                      2           2          NO  \n",
       "4                    2                      1           1          NO  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lung_data = pd.read_csv(r'C:\\Users\\elegb\\Desktop\\pdf\\survey lung cancer.csv')\n",
    "lung_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9abe8af8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 309 entries, 0 to 308\n",
      "Data columns (total 16 columns):\n",
      " #   Column                 Non-Null Count  Dtype \n",
      "---  ------                 --------------  ----- \n",
      " 0   GENDER                 309 non-null    object\n",
      " 1   AGE                    309 non-null    int64 \n",
      " 2   SMOKING                309 non-null    int64 \n",
      " 3   YELLOW_FINGERS         309 non-null    int64 \n",
      " 4   ANXIETY                309 non-null    int64 \n",
      " 5   PEER_PRESSURE          309 non-null    int64 \n",
      " 6   CHRONIC DISEASE        309 non-null    int64 \n",
      " 7   FATIGUE                309 non-null    int64 \n",
      " 8   ALLERGY                309 non-null    int64 \n",
      " 9   WHEEZING               309 non-null    int64 \n",
      " 10  ALCOHOL CONSUMING      309 non-null    int64 \n",
      " 11  COUGHING               309 non-null    int64 \n",
      " 12  SHORTNESS OF BREATH    309 non-null    int64 \n",
      " 13  SWALLOWING DIFFICULTY  309 non-null    int64 \n",
      " 14  CHEST PAIN             309 non-null    int64 \n",
      " 15  LUNG_CANCER            309 non-null    object\n",
      "dtypes: int64(14), object(2)\n",
      "memory usage: 38.8+ KB\n"
     ]
    }
   ],
   "source": [
    "lung_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3dbb3974",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 276 entries, 0 to 283\n",
      "Data columns (total 16 columns):\n",
      " #   Column                 Non-Null Count  Dtype \n",
      "---  ------                 --------------  ----- \n",
      " 0   GENDER                 276 non-null    object\n",
      " 1   AGE                    276 non-null    int64 \n",
      " 2   SMOKING                276 non-null    int64 \n",
      " 3   YELLOW_FINGERS         276 non-null    int64 \n",
      " 4   ANXIETY                276 non-null    int64 \n",
      " 5   PEER_PRESSURE          276 non-null    int64 \n",
      " 6   CHRONIC DISEASE        276 non-null    int64 \n",
      " 7   FATIGUE                276 non-null    int64 \n",
      " 8   ALLERGY                276 non-null    int64 \n",
      " 9   WHEEZING               276 non-null    int64 \n",
      " 10  ALCOHOL CONSUMING      276 non-null    int64 \n",
      " 11  COUGHING               276 non-null    int64 \n",
      " 12  SHORTNESS OF BREATH    276 non-null    int64 \n",
      " 13  SWALLOWING DIFFICULTY  276 non-null    int64 \n",
      " 14  CHEST PAIN             276 non-null    int64 \n",
      " 15  LUNG_CANCER            276 non-null    object\n",
      "dtypes: int64(14), object(2)\n",
      "memory usage: 36.7+ KB\n"
     ]
    }
   ],
   "source": [
    "lung_data = lung_data.drop_duplicates()\n",
    "lung_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2c09b012",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENDER</th>\n",
       "      <th>SMOKING</th>\n",
       "      <th>YELLOW_FINGERS</th>\n",
       "      <th>ANXIETY</th>\n",
       "      <th>PEER_PRESSURE</th>\n",
       "      <th>CHRONIC DISEASE</th>\n",
       "      <th>FATIGUE</th>\n",
       "      <th>ALLERGY</th>\n",
       "      <th>WHEEZING</th>\n",
       "      <th>ALCOHOL CONSUMING</th>\n",
       "      <th>COUGHING</th>\n",
       "      <th>SHORTNESS OF BREATH</th>\n",
       "      <th>SWALLOWING DIFFICULTY</th>\n",
       "      <th>CHEST PAIN</th>\n",
       "      <th>LUNG_CANCER</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  GENDER SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE CHRONIC DISEASE  \\\n",
       "0      1       0              1       1             0               0   \n",
       "1      1       1              0       0             0               1   \n",
       "2      0       0              0       0             1               0   \n",
       "3      1       1              1       1             0               0   \n",
       "4      0       0              1       0             0               0   \n",
       "\n",
       "  FATIGUE  ALLERGY  WHEEZING ALCOHOL CONSUMING COUGHING SHORTNESS OF BREATH  \\\n",
       "0        1        0        1                 1        1                   1   \n",
       "1        1        1        0                 0        0                   1   \n",
       "2        1        0        1                 0        1                   1   \n",
       "3        0        0        0                 1        0                   0   \n",
       "4        0        0        1                 0        1                   1   \n",
       "\n",
       "  SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER  \n",
       "0                     1          1           1  \n",
       "1                     1          1           1  \n",
       "2                     0          1           0  \n",
       "3                     1          1           0  \n",
       "4                     0          0           0  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "categorical = lung_data.drop(['AGE'], axis = 1)\n",
    "encoder = LabelEncoder()\n",
    "for col in categorical.columns:\n",
    "    categorical[col] = encoder.fit_transform(categorical[col])\n",
    "\n",
    "categorical = categorical.astype(\"category\")  \n",
    "categorical.head()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b15e78ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENDER</th>\n",
       "      <th>SMOKING</th>\n",
       "      <th>YELLOW_FINGERS</th>\n",
       "      <th>ANXIETY</th>\n",
       "      <th>PEER_PRESSURE</th>\n",
       "      <th>CHRONIC DISEASE</th>\n",
       "      <th>FATIGUE</th>\n",
       "      <th>ALLERGY</th>\n",
       "      <th>WHEEZING</th>\n",
       "      <th>ALCOHOL CONSUMING</th>\n",
       "      <th>COUGHING</th>\n",
       "      <th>SHORTNESS OF BREATH</th>\n",
       "      <th>SWALLOWING DIFFICULTY</th>\n",
       "      <th>CHEST PAIN</th>\n",
       "      <th>LUNG_CANCER</th>\n",
       "      <th>AGE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>69</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>63</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  GENDER SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE CHRONIC DISEASE  \\\n",
       "0      1       0              1       1             0               0   \n",
       "1      1       1              0       0             0               1   \n",
       "2      0       0              0       0             1               0   \n",
       "3      1       1              1       1             0               0   \n",
       "4      0       0              1       0             0               0   \n",
       "\n",
       "  FATIGUE  ALLERGY  WHEEZING ALCOHOL CONSUMING COUGHING SHORTNESS OF BREATH  \\\n",
       "0        1        0        1                 1        1                   1   \n",
       "1        1        1        0                 0        0                   1   \n",
       "2        1        0        1                 0        1                   1   \n",
       "3        0        0        0                 1        0                   0   \n",
       "4        0        0        1                 0        1                   1   \n",
       "\n",
       "  SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER  AGE  \n",
       "0                     1          1           1   69  \n",
       "1                     1          1           1   74  \n",
       "2                     0          1           0   59  \n",
       "3                     1          1           0   63  \n",
       "4                     0          0           0   63  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lung_data = pd.concat([categorical, lung_data['AGE']], axis = 1)\n",
    "lung_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8925150b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    238\n",
       "0     38\n",
       "Name: LUNG_CANCER, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lung_data.LUNG_CANCER.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c992c376",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1000 entries, 183 to 87\n",
      "Data columns (total 16 columns):\n",
      " #   Column                 Non-Null Count  Dtype   \n",
      "---  ------                 --------------  -----   \n",
      " 0   GENDER                 1000 non-null   category\n",
      " 1   SMOKING                1000 non-null   category\n",
      " 2   YELLOW_FINGERS         1000 non-null   category\n",
      " 3   ANXIETY                1000 non-null   category\n",
      " 4   PEER_PRESSURE          1000 non-null   category\n",
      " 5   CHRONIC DISEASE        1000 non-null   category\n",
      " 6   FATIGUE                1000 non-null   category\n",
      " 7   ALLERGY                1000 non-null   category\n",
      " 8   WHEEZING               1000 non-null   category\n",
      " 9   ALCOHOL CONSUMING      1000 non-null   category\n",
      " 10  COUGHING               1000 non-null   category\n",
      " 11  SHORTNESS OF BREATH    1000 non-null   category\n",
      " 12  SWALLOWING DIFFICULTY  1000 non-null   category\n",
      " 13  CHEST PAIN             1000 non-null   category\n",
      " 14  LUNG_CANCER            1000 non-null   category\n",
      " 15  AGE                    1000 non-null   int64   \n",
      "dtypes: category(15), int64(1)\n",
      "memory usage: 32.1 KB\n"
     ]
    }
   ],
   "source": [
    "class_0 = lung_data[lung_data['LUNG_CANCER'] == 0]\n",
    "class_1 = lung_data[lung_data['LUNG_CANCER'] == 1]\n",
    "class_1 = class_1.sample(n = 500, replace = True)\n",
    "class_0 = class_0.sample(n = 500, replace = True)\n",
    "lung_data = pd.concat([class_0, class_1], axis = 0)\n",
    "lung_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ebc06c8e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    500\n",
       "1    500\n",
       "Name: LUNG_CANCER, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lung_data['LUNG_CANCER'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b2ca517b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENDER</th>\n",
       "      <th>SMOKING</th>\n",
       "      <th>YELLOW_FINGERS</th>\n",
       "      <th>ANXIETY</th>\n",
       "      <th>PEER_PRESSURE</th>\n",
       "      <th>CHRONIC DISEASE</th>\n",
       "      <th>FATIGUE</th>\n",
       "      <th>ALLERGY</th>\n",
       "      <th>WHEEZING</th>\n",
       "      <th>ALCOHOL CONSUMING</th>\n",
       "      <th>COUGHING</th>\n",
       "      <th>SHORTNESS OF BREATH</th>\n",
       "      <th>SWALLOWING DIFFICULTY</th>\n",
       "      <th>CHEST PAIN</th>\n",
       "      <th>LUNG_CANCER</th>\n",
       "      <th>AGE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>183</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>69</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>68</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    GENDER SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE CHRONIC DISEASE  \\\n",
       "183      0       1              0       0             0               1   \n",
       "4        0       0              1       0             0               0   \n",
       "37       0       0              0       0             0               1   \n",
       "14       1       1              0       0             0               0   \n",
       "8        0       1              0       1             0               0   \n",
       "\n",
       "    FATIGUE  ALLERGY  WHEEZING ALCOHOL CONSUMING COUGHING SHORTNESS OF BREATH  \\\n",
       "183        1        0        0                 0        0                   1   \n",
       "4          0        0        1                 0        1                   1   \n",
       "37         0        0        1                 0        0                   1   \n",
       "14         0        1        1                 1        1                   0   \n",
       "8          1        0        0                 0        0                   0   \n",
       "\n",
       "    SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER  AGE  \n",
       "183                     0          0           0   71  \n",
       "4                       0          0           0   63  \n",
       "37                      1          0           0   56  \n",
       "14                      0          1           0   69  \n",
       "8                       0          0           0   68  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lung_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "62696544",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = lung_data.drop('LUNG_CANCER', axis =1)\n",
    "y = lung_data.LUNG_CANCER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6e4572a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1    80\n",
      "0    70\n",
      "Name: LUNG_CANCER, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.15, random_state = 42)\n",
    "\n",
    "print(y_test.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1b243af1",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_columns = ['GENDER', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE',\n",
    "    'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',\n",
    "    'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',\n",
    "    'SWALLOWING DIFFICULTY', 'CHEST PAIN',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d0a038fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.95      1.00      0.97        70\n",
      "           1       1.00      0.95      0.97        80\n",
      "\n",
      "    accuracy                           0.97       150\n",
      "   macro avg       0.97      0.97      0.97       150\n",
      "weighted avg       0.97      0.97      0.97       150\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[70,  0],\n",
       "       [ 4, 76]], dtype=int64)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create DMatrix for training and test sets with categorical features enabled\n",
    "X_train = X_train.astype({col: \"category\" for col in categorical_columns})\n",
    "\n",
    "dtrain = xgb.DMatrix(X_train , label=y_train, enable_categorical=True)\n",
    "dtest = xgb.DMatrix(X_test, enable_categorical=True)\n",
    "\n",
    "# set parameters for XGBoost classifier\n",
    "\n",
    "params = {\n",
    "    'objective': 'binary:logistic',\n",
    "    'max_depth':3, \n",
    "    'eta':1, \n",
    "    'nthread': 3,\n",
    "    'eval_metric': 'auc',\n",
    "    'learning_rate': 1\n",
    "}\n",
    "\n",
    "# train model\n",
    "model = xgb.train(params, dtrain, num_boost_round=100)\n",
    "# make predictions on test data\n",
    "y_pred = model.predict(dtest)\n",
    "\n",
    "# convert probabilities to binary predictions\n",
    "y_pred_binary = [1 if p >= 0.99 else 0 for p in y_pred]\n",
    "\n",
    "# evaluate model performance\n",
    "print(classification_report(y_test, y_pred_binary))\n",
    "confusion_matrix(y_test, y_pred_binary)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d74abb37",
   "metadata": {},
   "source": [
    "#### Lets assume i want to make predictions  with that of a new patient coming in"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "cd579274",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>GENDER</th>\n",
       "      <th>SMOKING</th>\n",
       "      <th>YELLOW_FINGERS</th>\n",
       "      <th>ANXIETY</th>\n",
       "      <th>PEER_PRESSURE</th>\n",
       "      <th>CHRONIC DISEASE</th>\n",
       "      <th>FATIGUE</th>\n",
       "      <th>ALLERGY</th>\n",
       "      <th>WHEEZING</th>\n",
       "      <th>ALCOHOL CONSUMING</th>\n",
       "      <th>COUGHING</th>\n",
       "      <th>SHORTNESS OF BREATH</th>\n",
       "      <th>SWALLOWING DIFFICULTY</th>\n",
       "      <th>CHEST PAIN</th>\n",
       "      <th>AGE</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   GENDER  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  CHRONIC DISEASE  \\\n",
       "0       1        0               0        1              0                0   \n",
       "\n",
       "   FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \\\n",
       "0         0         0         0                  0         1   \n",
       "\n",
       "   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN  AGE  \n",
       "0                    0                      0           1   25  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_dict = {'GENDER': 1, 'SMOKING': 0, 'YELLOW_FINGERS':0,\n",
    "                  'ANXIETY': 1, 'PEER_PRESSURE': 0,\n",
    "                  'CHRONIC DISEASE': 0, 'FATIGUE ': 0,\n",
    "                  'ALLERGY ': 0, 'WHEEZING': 0,\n",
    "                  'ALCOHOL CONSUMING': 0, 'COUGHING': 1,\n",
    "                  'SHORTNESS OF BREATH': 0,\n",
    "                  'SWALLOWING DIFFICULTY': 0,\n",
    "                  'CHEST PAIN': 1, 'AGE': 25}\n",
    "input_df = pd.DataFrame.from_dict([input_dict])\n",
    "input_df.astype({col: \"category\" for col in categorical_columns})\n",
    "input_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0f6dc7eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1 entries, 0 to 0\n",
      "Data columns (total 15 columns):\n",
      " #   Column                 Non-Null Count  Dtype\n",
      "---  ------                 --------------  -----\n",
      " 0   GENDER                 1 non-null      int64\n",
      " 1   SMOKING                1 non-null      int64\n",
      " 2   YELLOW_FINGERS         1 non-null      int64\n",
      " 3   ANXIETY                1 non-null      int64\n",
      " 4   PEER_PRESSURE          1 non-null      int64\n",
      " 5   CHRONIC DISEASE        1 non-null      int64\n",
      " 6   FATIGUE                1 non-null      int64\n",
      " 7   ALLERGY                1 non-null      int64\n",
      " 8   WHEEZING               1 non-null      int64\n",
      " 9   ALCOHOL CONSUMING      1 non-null      int64\n",
      " 10  COUGHING               1 non-null      int64\n",
      " 11  SHORTNESS OF BREATH    1 non-null      int64\n",
      " 12  SWALLOWING DIFFICULTY  1 non-null      int64\n",
      " 13  CHEST PAIN             1 non-null      int64\n",
      " 14  AGE                    1 non-null      int64\n",
      "dtypes: int64(15)\n",
      "memory usage: 248.0 bytes\n"
     ]
    }
   ],
   "source": [
    "input_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "afb5ec1e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.20313403], dtype=float32)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "categorical_columns = ['GENDER', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE',\n",
    "    'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',\n",
    "    'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',\n",
    "    'SWALLOWING DIFFICULTY', 'CHEST PAIN',\n",
    "]\n",
    "\n",
    "\n",
    "dtest = xgb.DMatrix(input_df)\n",
    "\n",
    "prediction = model.predict(dtest)\n",
    "prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91746991",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "728ee97e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fff3df16",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL:  http://127.0.0.1:7860\n",
      "Running on public URL: https://de43ecf59e54e5afd9.gradio.live\n",
      "\n",
      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n"
     ]
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import gradio as gr\n",
    "\n",
    "# Define the Gradio input and output interfaces\n",
    "inputs = [\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Gender\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you smoke?\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Yellow Fingers\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Anxiety\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you get influenced by Peer Pressure\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have any Chronic Disease\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Fatigue\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have an Allergy\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you experience Wheezing\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you drink alcohol\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Are you Coughing\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Shortness of Breath\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Swallowing Difficulty\"),\n",
    "    gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Chest Pain\"),\n",
    "    gr.inputs.Number(label='Age')\n",
    "]\n",
    "\n",
    "output = gr.outputs.Label(num_top_classes=2)\n",
    "\n",
    "# Define the predict function\n",
    "def predict(gender, smoking, yellow_fingers, anxiety, peer_pressure,\n",
    "            chronic_disease, fatigue, allergy, wheezing, alcohol_consuming,\n",
    "            coughing, shortness_of_breath, swallowing_difficulty, chest_pain,\n",
    "            age):\n",
    "    # Create a dataframe with the input values\n",
    "    input_dict = {'GENDER': gender, 'SMOKING': smoking, 'YELLOW_FINGERS': yellow_fingers,\n",
    "                  'ANXIETY': anxiety, 'PEER_PRESSURE': peer_pressure,\n",
    "                  'CHRONIC DISEASE': chronic_disease, 'FATIGUE ': fatigue,\n",
    "                  'ALLERGY ': allergy, 'WHEEZING': wheezing,\n",
    "                  'ALCOHOL CONSUMING': alcohol_consuming, 'COUGHING': coughing,\n",
    "                  'SHORTNESS OF BREATH': shortness_of_breath,\n",
    "                  'SWALLOWING DIFFICULTY': swallowing_difficulty,\n",
    "                  'CHEST PAIN': chest_pain, 'AGE': age}\n",
    "    input_df = pd.DataFrame.from_dict([input_dict]).astype(\"int\")\n",
    "    \n",
    "    dtest = xgb.DMatrix(input_df)\n",
    "    \n",
    "    \n",
    "    #make predictions\n",
    "    prediction = model.predict(dtest)\n",
    "    \n",
    "    # Return prediction\n",
    "    return \"You have Lung Cancer, you might want to see the Doctor.\" if prediction >0.99 else \"You don't have Lung Cancer, Enjoy❤\"\n",
    "\n",
    "# Create and launch the interface\n",
    "interface = gr.Interface(fn=predict, inputs=inputs, outputs=output, \n",
    "                     title='Lung Cancer Prediction', description='Predicting lung cancer using XGBoost Classifier.\\nPlease Note:\\nFemale = 0, Male= 1\\nNo = 0, Yes = 1')\n",
    "interface.launch(auth = ('user', 'atom'), share = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de27fbe1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}