{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d4c303ef",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.simplefilter(\"ignore\")\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"from sklearn.model_selection import train_test_split\n",
"import xgboost as xgb\n",
"from sklearn.preprocessing import LabelEncoder\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4e15af5f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GENDER | \n",
" AGE | \n",
" SMOKING | \n",
" YELLOW_FINGERS | \n",
" ANXIETY | \n",
" PEER_PRESSURE | \n",
" CHRONIC DISEASE | \n",
" FATIGUE | \n",
" ALLERGY | \n",
" WHEEZING | \n",
" ALCOHOL CONSUMING | \n",
" COUGHING | \n",
" SHORTNESS OF BREATH | \n",
" SWALLOWING DIFFICULTY | \n",
" CHEST PAIN | \n",
" LUNG_CANCER | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" M | \n",
" 69 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" YES | \n",
"
\n",
" \n",
" 1 | \n",
" M | \n",
" 74 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" YES | \n",
"
\n",
" \n",
" 2 | \n",
" F | \n",
" 59 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" NO | \n",
"
\n",
" \n",
" 3 | \n",
" M | \n",
" 63 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" NO | \n",
"
\n",
" \n",
" 4 | \n",
" F | \n",
" 63 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" NO | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GENDER AGE SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE \\\n",
"0 M 69 1 2 2 1 \n",
"1 M 74 2 1 1 1 \n",
"2 F 59 1 1 1 2 \n",
"3 M 63 2 2 2 1 \n",
"4 F 63 1 2 1 1 \n",
"\n",
" CHRONIC DISEASE FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING COUGHING \\\n",
"0 1 2 1 2 2 2 \n",
"1 2 2 2 1 1 1 \n",
"2 1 2 1 2 1 2 \n",
"3 1 1 1 1 2 1 \n",
"4 1 1 1 2 1 2 \n",
"\n",
" SHORTNESS OF BREATH SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER \n",
"0 2 2 2 YES \n",
"1 2 2 2 YES \n",
"2 2 1 2 NO \n",
"3 1 2 2 NO \n",
"4 2 1 1 NO "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lung_data = pd.read_csv(r'C:\\Users\\elegb\\Desktop\\pdf\\survey lung cancer.csv')\n",
"lung_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9abe8af8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 309 entries, 0 to 308\n",
"Data columns (total 16 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 GENDER 309 non-null object\n",
" 1 AGE 309 non-null int64 \n",
" 2 SMOKING 309 non-null int64 \n",
" 3 YELLOW_FINGERS 309 non-null int64 \n",
" 4 ANXIETY 309 non-null int64 \n",
" 5 PEER_PRESSURE 309 non-null int64 \n",
" 6 CHRONIC DISEASE 309 non-null int64 \n",
" 7 FATIGUE 309 non-null int64 \n",
" 8 ALLERGY 309 non-null int64 \n",
" 9 WHEEZING 309 non-null int64 \n",
" 10 ALCOHOL CONSUMING 309 non-null int64 \n",
" 11 COUGHING 309 non-null int64 \n",
" 12 SHORTNESS OF BREATH 309 non-null int64 \n",
" 13 SWALLOWING DIFFICULTY 309 non-null int64 \n",
" 14 CHEST PAIN 309 non-null int64 \n",
" 15 LUNG_CANCER 309 non-null object\n",
"dtypes: int64(14), object(2)\n",
"memory usage: 38.8+ KB\n"
]
}
],
"source": [
"lung_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3dbb3974",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 276 entries, 0 to 283\n",
"Data columns (total 16 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 GENDER 276 non-null object\n",
" 1 AGE 276 non-null int64 \n",
" 2 SMOKING 276 non-null int64 \n",
" 3 YELLOW_FINGERS 276 non-null int64 \n",
" 4 ANXIETY 276 non-null int64 \n",
" 5 PEER_PRESSURE 276 non-null int64 \n",
" 6 CHRONIC DISEASE 276 non-null int64 \n",
" 7 FATIGUE 276 non-null int64 \n",
" 8 ALLERGY 276 non-null int64 \n",
" 9 WHEEZING 276 non-null int64 \n",
" 10 ALCOHOL CONSUMING 276 non-null int64 \n",
" 11 COUGHING 276 non-null int64 \n",
" 12 SHORTNESS OF BREATH 276 non-null int64 \n",
" 13 SWALLOWING DIFFICULTY 276 non-null int64 \n",
" 14 CHEST PAIN 276 non-null int64 \n",
" 15 LUNG_CANCER 276 non-null object\n",
"dtypes: int64(14), object(2)\n",
"memory usage: 36.7+ KB\n"
]
}
],
"source": [
"lung_data = lung_data.drop_duplicates()\n",
"lung_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "2c09b012",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GENDER | \n",
" SMOKING | \n",
" YELLOW_FINGERS | \n",
" ANXIETY | \n",
" PEER_PRESSURE | \n",
" CHRONIC DISEASE | \n",
" FATIGUE | \n",
" ALLERGY | \n",
" WHEEZING | \n",
" ALCOHOL CONSUMING | \n",
" COUGHING | \n",
" SHORTNESS OF BREATH | \n",
" SWALLOWING DIFFICULTY | \n",
" CHEST PAIN | \n",
" LUNG_CANCER | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GENDER SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE CHRONIC DISEASE \\\n",
"0 1 0 1 1 0 0 \n",
"1 1 1 0 0 0 1 \n",
"2 0 0 0 0 1 0 \n",
"3 1 1 1 1 0 0 \n",
"4 0 0 1 0 0 0 \n",
"\n",
" FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING COUGHING SHORTNESS OF BREATH \\\n",
"0 1 0 1 1 1 1 \n",
"1 1 1 0 0 0 1 \n",
"2 1 0 1 0 1 1 \n",
"3 0 0 0 1 0 0 \n",
"4 0 0 1 0 1 1 \n",
"\n",
" SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER \n",
"0 1 1 1 \n",
"1 1 1 1 \n",
"2 0 1 0 \n",
"3 1 1 0 \n",
"4 0 0 0 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"categorical = lung_data.drop(['AGE'], axis = 1)\n",
"encoder = LabelEncoder()\n",
"for col in categorical.columns:\n",
" categorical[col] = encoder.fit_transform(categorical[col])\n",
"\n",
"categorical = categorical.astype(\"category\") \n",
"categorical.head()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b15e78ca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GENDER | \n",
" SMOKING | \n",
" YELLOW_FINGERS | \n",
" ANXIETY | \n",
" PEER_PRESSURE | \n",
" CHRONIC DISEASE | \n",
" FATIGUE | \n",
" ALLERGY | \n",
" WHEEZING | \n",
" ALCOHOL CONSUMING | \n",
" COUGHING | \n",
" SHORTNESS OF BREATH | \n",
" SWALLOWING DIFFICULTY | \n",
" CHEST PAIN | \n",
" LUNG_CANCER | \n",
" AGE | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 69 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 74 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 59 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 63 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 63 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GENDER SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE CHRONIC DISEASE \\\n",
"0 1 0 1 1 0 0 \n",
"1 1 1 0 0 0 1 \n",
"2 0 0 0 0 1 0 \n",
"3 1 1 1 1 0 0 \n",
"4 0 0 1 0 0 0 \n",
"\n",
" FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING COUGHING SHORTNESS OF BREATH \\\n",
"0 1 0 1 1 1 1 \n",
"1 1 1 0 0 0 1 \n",
"2 1 0 1 0 1 1 \n",
"3 0 0 0 1 0 0 \n",
"4 0 0 1 0 1 1 \n",
"\n",
" SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER AGE \n",
"0 1 1 1 69 \n",
"1 1 1 1 74 \n",
"2 0 1 0 59 \n",
"3 1 1 0 63 \n",
"4 0 0 0 63 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lung_data = pd.concat([categorical, lung_data['AGE']], axis = 1)\n",
"lung_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8925150b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 238\n",
"0 38\n",
"Name: LUNG_CANCER, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lung_data.LUNG_CANCER.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c992c376",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 1000 entries, 183 to 87\n",
"Data columns (total 16 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 GENDER 1000 non-null category\n",
" 1 SMOKING 1000 non-null category\n",
" 2 YELLOW_FINGERS 1000 non-null category\n",
" 3 ANXIETY 1000 non-null category\n",
" 4 PEER_PRESSURE 1000 non-null category\n",
" 5 CHRONIC DISEASE 1000 non-null category\n",
" 6 FATIGUE 1000 non-null category\n",
" 7 ALLERGY 1000 non-null category\n",
" 8 WHEEZING 1000 non-null category\n",
" 9 ALCOHOL CONSUMING 1000 non-null category\n",
" 10 COUGHING 1000 non-null category\n",
" 11 SHORTNESS OF BREATH 1000 non-null category\n",
" 12 SWALLOWING DIFFICULTY 1000 non-null category\n",
" 13 CHEST PAIN 1000 non-null category\n",
" 14 LUNG_CANCER 1000 non-null category\n",
" 15 AGE 1000 non-null int64 \n",
"dtypes: category(15), int64(1)\n",
"memory usage: 32.1 KB\n"
]
}
],
"source": [
"class_0 = lung_data[lung_data['LUNG_CANCER'] == 0]\n",
"class_1 = lung_data[lung_data['LUNG_CANCER'] == 1]\n",
"class_1 = class_1.sample(n = 500, replace = True)\n",
"class_0 = class_0.sample(n = 500, replace = True)\n",
"lung_data = pd.concat([class_0, class_1], axis = 0)\n",
"lung_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ebc06c8e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 500\n",
"1 500\n",
"Name: LUNG_CANCER, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lung_data['LUNG_CANCER'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b2ca517b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GENDER | \n",
" SMOKING | \n",
" YELLOW_FINGERS | \n",
" ANXIETY | \n",
" PEER_PRESSURE | \n",
" CHRONIC DISEASE | \n",
" FATIGUE | \n",
" ALLERGY | \n",
" WHEEZING | \n",
" ALCOHOL CONSUMING | \n",
" COUGHING | \n",
" SHORTNESS OF BREATH | \n",
" SWALLOWING DIFFICULTY | \n",
" CHEST PAIN | \n",
" LUNG_CANCER | \n",
" AGE | \n",
"
\n",
" \n",
" \n",
" \n",
" 183 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 71 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 63 | \n",
"
\n",
" \n",
" 37 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 56 | \n",
"
\n",
" \n",
" 14 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 69 | \n",
"
\n",
" \n",
" 8 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 68 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GENDER SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE CHRONIC DISEASE \\\n",
"183 0 1 0 0 0 1 \n",
"4 0 0 1 0 0 0 \n",
"37 0 0 0 0 0 1 \n",
"14 1 1 0 0 0 0 \n",
"8 0 1 0 1 0 0 \n",
"\n",
" FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING COUGHING SHORTNESS OF BREATH \\\n",
"183 1 0 0 0 0 1 \n",
"4 0 0 1 0 1 1 \n",
"37 0 0 1 0 0 1 \n",
"14 0 1 1 1 1 0 \n",
"8 1 0 0 0 0 0 \n",
"\n",
" SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER AGE \n",
"183 0 0 0 71 \n",
"4 0 0 0 63 \n",
"37 1 0 0 56 \n",
"14 0 1 0 69 \n",
"8 0 0 0 68 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lung_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "62696544",
"metadata": {},
"outputs": [],
"source": [
"X = lung_data.drop('LUNG_CANCER', axis =1)\n",
"y = lung_data.LUNG_CANCER"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6e4572a6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 80\n",
"0 70\n",
"Name: LUNG_CANCER, dtype: int64\n"
]
}
],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.15, random_state = 42)\n",
"\n",
"print(y_test.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1b243af1",
"metadata": {},
"outputs": [],
"source": [
"categorical_columns = ['GENDER', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE',\n",
" 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',\n",
" 'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',\n",
" 'SWALLOWING DIFFICULTY', 'CHEST PAIN',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d0a038fd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.95 1.00 0.97 70\n",
" 1 1.00 0.95 0.97 80\n",
"\n",
" accuracy 0.97 150\n",
" macro avg 0.97 0.97 0.97 150\n",
"weighted avg 0.97 0.97 0.97 150\n",
"\n"
]
},
{
"data": {
"text/plain": [
"array([[70, 0],\n",
" [ 4, 76]], dtype=int64)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create DMatrix for training and test sets with categorical features enabled\n",
"X_train = X_train.astype({col: \"category\" for col in categorical_columns})\n",
"\n",
"dtrain = xgb.DMatrix(X_train , label=y_train, enable_categorical=True)\n",
"dtest = xgb.DMatrix(X_test, enable_categorical=True)\n",
"\n",
"# set parameters for XGBoost classifier\n",
"\n",
"params = {\n",
" 'objective': 'binary:logistic',\n",
" 'max_depth':3, \n",
" 'eta':1, \n",
" 'nthread': 3,\n",
" 'eval_metric': 'auc',\n",
" 'learning_rate': 1\n",
"}\n",
"\n",
"# train model\n",
"model = xgb.train(params, dtrain, num_boost_round=100)\n",
"# make predictions on test data\n",
"y_pred = model.predict(dtest)\n",
"\n",
"# convert probabilities to binary predictions\n",
"y_pred_binary = [1 if p >= 0.99 else 0 for p in y_pred]\n",
"\n",
"# evaluate model performance\n",
"print(classification_report(y_test, y_pred_binary))\n",
"confusion_matrix(y_test, y_pred_binary)"
]
},
{
"cell_type": "markdown",
"id": "d74abb37",
"metadata": {},
"source": [
"#### Lets assume i want to make predictions with that of a new patient coming in"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "cd579274",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" GENDER | \n",
" SMOKING | \n",
" YELLOW_FINGERS | \n",
" ANXIETY | \n",
" PEER_PRESSURE | \n",
" CHRONIC DISEASE | \n",
" FATIGUE | \n",
" ALLERGY | \n",
" WHEEZING | \n",
" ALCOHOL CONSUMING | \n",
" COUGHING | \n",
" SHORTNESS OF BREATH | \n",
" SWALLOWING DIFFICULTY | \n",
" CHEST PAIN | \n",
" AGE | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 25 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" GENDER SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE CHRONIC DISEASE \\\n",
"0 1 0 0 1 0 0 \n",
"\n",
" FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING COUGHING \\\n",
"0 0 0 0 0 1 \n",
"\n",
" SHORTNESS OF BREATH SWALLOWING DIFFICULTY CHEST PAIN AGE \n",
"0 0 0 1 25 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_dict = {'GENDER': 1, 'SMOKING': 0, 'YELLOW_FINGERS':0,\n",
" 'ANXIETY': 1, 'PEER_PRESSURE': 0,\n",
" 'CHRONIC DISEASE': 0, 'FATIGUE ': 0,\n",
" 'ALLERGY ': 0, 'WHEEZING': 0,\n",
" 'ALCOHOL CONSUMING': 0, 'COUGHING': 1,\n",
" 'SHORTNESS OF BREATH': 0,\n",
" 'SWALLOWING DIFFICULTY': 0,\n",
" 'CHEST PAIN': 1, 'AGE': 25}\n",
"input_df = pd.DataFrame.from_dict([input_dict])\n",
"input_df.astype({col: \"category\" for col in categorical_columns})\n",
"input_df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "0f6dc7eb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 1 entries, 0 to 0\n",
"Data columns (total 15 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 GENDER 1 non-null int64\n",
" 1 SMOKING 1 non-null int64\n",
" 2 YELLOW_FINGERS 1 non-null int64\n",
" 3 ANXIETY 1 non-null int64\n",
" 4 PEER_PRESSURE 1 non-null int64\n",
" 5 CHRONIC DISEASE 1 non-null int64\n",
" 6 FATIGUE 1 non-null int64\n",
" 7 ALLERGY 1 non-null int64\n",
" 8 WHEEZING 1 non-null int64\n",
" 9 ALCOHOL CONSUMING 1 non-null int64\n",
" 10 COUGHING 1 non-null int64\n",
" 11 SHORTNESS OF BREATH 1 non-null int64\n",
" 12 SWALLOWING DIFFICULTY 1 non-null int64\n",
" 13 CHEST PAIN 1 non-null int64\n",
" 14 AGE 1 non-null int64\n",
"dtypes: int64(15)\n",
"memory usage: 248.0 bytes\n"
]
}
],
"source": [
"input_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "afb5ec1e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.20313403], dtype=float32)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"categorical_columns = ['GENDER', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE',\n",
" 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',\n",
" 'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',\n",
" 'SWALLOWING DIFFICULTY', 'CHEST PAIN',\n",
"]\n",
"\n",
"\n",
"dtest = xgb.DMatrix(input_df)\n",
"\n",
"prediction = model.predict(dtest)\n",
"prediction"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91746991",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "728ee97e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 19,
"id": "fff3df16",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running on local URL: http://127.0.0.1:7860\n",
"Running on public URL: https://de43ecf59e54e5afd9.gradio.live\n",
"\n",
"This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n"
]
},
{
"data": {
"text/plain": []
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import gradio as gr\n",
"\n",
"# Define the Gradio input and output interfaces\n",
"inputs = [\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Gender\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you smoke?\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Yellow Fingers\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Anxiety\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you get influenced by Peer Pressure\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have any Chronic Disease\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Fatigue\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have an Allergy\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you experience Wheezing\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you drink alcohol\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Are you Coughing\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Shortness of Breath\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Swallowing Difficulty\"),\n",
" gr.inputs.Dropdown(choices=[\"0\", \"1\"], label=\"Do you have Chest Pain\"),\n",
" gr.inputs.Number(label='Age')\n",
"]\n",
"\n",
"output = gr.outputs.Label(num_top_classes=2)\n",
"\n",
"# Define the predict function\n",
"def predict(gender, smoking, yellow_fingers, anxiety, peer_pressure,\n",
" chronic_disease, fatigue, allergy, wheezing, alcohol_consuming,\n",
" coughing, shortness_of_breath, swallowing_difficulty, chest_pain,\n",
" age):\n",
" # Create a dataframe with the input values\n",
" input_dict = {'GENDER': gender, 'SMOKING': smoking, 'YELLOW_FINGERS': yellow_fingers,\n",
" 'ANXIETY': anxiety, 'PEER_PRESSURE': peer_pressure,\n",
" 'CHRONIC DISEASE': chronic_disease, 'FATIGUE ': fatigue,\n",
" 'ALLERGY ': allergy, 'WHEEZING': wheezing,\n",
" 'ALCOHOL CONSUMING': alcohol_consuming, 'COUGHING': coughing,\n",
" 'SHORTNESS OF BREATH': shortness_of_breath,\n",
" 'SWALLOWING DIFFICULTY': swallowing_difficulty,\n",
" 'CHEST PAIN': chest_pain, 'AGE': age}\n",
" input_df = pd.DataFrame.from_dict([input_dict]).astype(\"int\")\n",
" \n",
" dtest = xgb.DMatrix(input_df)\n",
" \n",
" \n",
" #make predictions\n",
" prediction = model.predict(dtest)\n",
" \n",
" # Return prediction\n",
" return \"You have Lung Cancer, you might want to see the Doctor.\" if prediction >0.99 else \"You don't have Lung Cancer, Enjoyā¯¤\"\n",
"\n",
"# Create and launch the interface\n",
"interface = gr.Interface(fn=predict, inputs=inputs, outputs=output, \n",
" title='Lung Cancer Prediction', description='Predicting lung cancer using XGBoost Classifier.\\nPlease Note:\\nFemale = 0, Male= 1\\nNo = 0, Yes = 1')\n",
"interface.launch(auth = ('user', 'atom'), share = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de27fbe1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}