{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "LnPbntVRnfvV"
},
"source": [
"Importing the Dependencies"
]
},
{
"cell_type": "code",
"metadata": {
"id": "-71UtHzNVWjB"
},
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import svm\n",
"from sklearn.metrics import accuracy_score"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "bmfOfG8joBBy"
},
"source": [
"Data Collection and Analysis\n",
"\n",
"PIMA Diabetes Dataset"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Xpw6Mj_pn_TL"
},
"source": [
"# read the PIMA diabetes CSV into a pandas DataFrame\n",
"diabetes_dataset = pd.read_csv(filepath_or_buffer='/content/diabetes.csv')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "-tjO09ncovoh",
"outputId": "0f5f8129-eb57-4ba0-f329-312bba4aae27"
},
"source": [
"# preview the first five records of the dataset\n",
"diabetes_dataset.head(n=5)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure ... DiabetesPedigreeFunction Age Outcome\n",
"0 6 148 72 ... 0.627 50 1\n",
"1 1 85 66 ... 0.351 31 0\n",
"2 8 183 64 ... 0.672 32 1\n",
"3 1 89 66 ... 0.167 21 0\n",
"4 0 137 40 ... 2.288 33 1\n",
"\n",
"[5 rows x 9 columns]"
]
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "lynParo6pEMB",
"outputId": "ab7d817a-1f20-46d0-d504-833efb433f7d"
},
"source": [
"# dataset dimensions as a (row count, column count) tuple\n",
"diabetes_dataset.shape"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(768, 9)"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 300
},
"id": "3NDJOlrEpmoL",
"outputId": "12af9f8e-b5fb-4f7f-a4bb-f5df64cce508"
},
"source": [
"# summary statistics (count, mean, std, min, quartiles, max) for each column\n",
"diabetes_dataset.describe()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 3.845052 | \n",
" 120.894531 | \n",
" 69.105469 | \n",
" 20.536458 | \n",
" 79.799479 | \n",
" 31.992578 | \n",
" 0.471876 | \n",
" 33.240885 | \n",
" 0.348958 | \n",
"
\n",
" \n",
" std | \n",
" 3.369578 | \n",
" 31.972618 | \n",
" 19.355807 | \n",
" 15.952218 | \n",
" 115.244002 | \n",
" 7.884160 | \n",
" 0.331329 | \n",
" 11.760232 | \n",
" 0.476951 | \n",
"
\n",
" \n",
" min | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.078000 | \n",
" 21.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 1.000000 | \n",
" 99.000000 | \n",
" 62.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 27.300000 | \n",
" 0.243750 | \n",
" 24.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 3.000000 | \n",
" 117.000000 | \n",
" 72.000000 | \n",
" 23.000000 | \n",
" 30.500000 | \n",
" 32.000000 | \n",
" 0.372500 | \n",
" 29.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 6.000000 | \n",
" 140.250000 | \n",
" 80.000000 | \n",
" 32.000000 | \n",
" 127.250000 | \n",
" 36.600000 | \n",
" 0.626250 | \n",
" 41.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" max | \n",
" 17.000000 | \n",
" 199.000000 | \n",
" 122.000000 | \n",
" 99.000000 | \n",
" 846.000000 | \n",
" 67.100000 | \n",
" 2.420000 | \n",
" 81.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose ... Age Outcome\n",
"count 768.000000 768.000000 ... 768.000000 768.000000\n",
"mean 3.845052 120.894531 ... 33.240885 0.348958\n",
"std 3.369578 31.972618 ... 11.760232 0.476951\n",
"min 0.000000 0.000000 ... 21.000000 0.000000\n",
"25% 1.000000 99.000000 ... 24.000000 0.000000\n",
"50% 3.000000 117.000000 ... 29.000000 0.000000\n",
"75% 6.000000 140.250000 ... 41.000000 1.000000\n",
"max 17.000000 199.000000 ... 81.000000 1.000000\n",
"\n",
"[8 rows x 9 columns]"
]
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LrpHzaGpp5dQ",
"outputId": "916953df-2cee-43a9-cc80-2e58fe6b43d2"
},
"source": [
"# how many rows fall into each Outcome class\n",
"diabetes_dataset.Outcome.value_counts()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0 500\n",
"1 268\n",
"Name: Outcome, dtype: int64"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cB1qRaNcqeh5"
},
"source": [
"0 --> Non-Diabetic\n",
"\n",
"1 --> Diabetic"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "I6MWR0k_qSCK",
"outputId": "47b23d5c-8973-4868-8582-b0fa95bfed46"
},
"source": [
"# mean of every feature column within each Outcome class\n",
"diabetes_dataset.groupby(by='Outcome').mean()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
"
\n",
" \n",
" Outcome | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3.298000 | \n",
" 109.980000 | \n",
" 68.184000 | \n",
" 19.664000 | \n",
" 68.792000 | \n",
" 30.304200 | \n",
" 0.429734 | \n",
" 31.190000 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.865672 | \n",
" 141.257463 | \n",
" 70.824627 | \n",
" 22.164179 | \n",
" 100.335821 | \n",
" 35.142537 | \n",
" 0.550500 | \n",
" 37.067164 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose ... DiabetesPedigreeFunction Age\n",
"Outcome ... \n",
"0 3.298000 109.980000 ... 0.429734 31.190000\n",
"1 4.865672 141.257463 ... 0.550500 37.067164\n",
"\n",
"[2 rows x 8 columns]"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RoDW7l9mqqHZ"
},
"source": [
"# split the frame into the label vector (Y) and the feature matrix (X)\n",
"Y = diabetes_dataset['Outcome']\n",
"X = diabetes_dataset.drop(columns='Outcome')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3eiRW9M9raMm",
"outputId": "552c0851-90ec-4068-812d-c848224be8a7"
},
"source": [
"# plain-text dump of the feature matrix\n",
"print(X)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Pregnancies Glucose BloodPressure ... BMI DiabetesPedigreeFunction Age\n",
"0 6 148 72 ... 33.6 0.627 50\n",
"1 1 85 66 ... 26.6 0.351 31\n",
"2 8 183 64 ... 23.3 0.672 32\n",
"3 1 89 66 ... 28.1 0.167 21\n",
"4 0 137 40 ... 43.1 2.288 33\n",
".. ... ... ... ... ... ... ...\n",
"763 10 101 76 ... 32.9 0.171 63\n",
"764 2 122 70 ... 36.8 0.340 27\n",
"765 5 121 72 ... 26.2 0.245 30\n",
"766 1 126 60 ... 30.1 0.349 47\n",
"767 1 93 70 ... 30.4 0.315 23\n",
"\n",
"[768 rows x 8 columns]\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AoxgTJAMrcCl",
"outputId": "d6f83516-18e5-41ca-c6ce-4495bdf733cb"
},
"source": [
"# plain-text dump of the label vector\n",
"print(Y)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0 1\n",
"1 0\n",
"2 1\n",
"3 0\n",
"4 1\n",
" ..\n",
"763 0\n",
"764 0\n",
"765 0\n",
"766 1\n",
"767 0\n",
"Name: Outcome, Length: 768, dtype: int64\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gHciEFkxsoQP"
},
"source": [
"Train Test Split"
]
},
{
"cell_type": "code",
"metadata": {
"id": "AEfKGj_yslvD"
},
"source": [
"# hold out 20% of the rows for testing; stratify on Y and fix the seed\n",
"X_train, X_test, Y_train, Y_test = train_test_split(\n",
"    X, Y, test_size=0.2, stratify=Y, random_state=2\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DR05T-o0t3FQ",
"outputId": "2b7c195d-58d7-4c4d-803d-34e09791b07a"
},
"source": [
"# shapes of the full, training and test feature matrices, in that order\n",
"print(*(frame.shape for frame in (X, X_train, X_test)))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(768, 8) (614, 8) (154, 8)\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ElJ3tkOtuC_n"
},
"source": [
"Training the Model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "5szLWHlNt9xc"
},
"source": [
"# support vector classifier configured with a linear kernel\n",
"classifier = svm.SVC(kernel='linear')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ncJWY_7suPAb",
"outputId": "1a8fc42b-37a5-4e59-d52a-5dd5e09560e8"
},
"source": [
"# fit the support vector machine classifier on the training split\n",
"classifier.fit(X=X_train, y=Y_train)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"SVC(kernel='linear')"
]
},
"metadata": {},
"execution_count": 14
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UV4-CAfquiyP"
},
"source": [
"Model Evaluation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yhAjGPJWunXa"
},
"source": [
"Accuracy Score"
]
},
{
"cell_type": "code",
"metadata": {
"id": "fJLEPQK7ueXp"
},
"source": [
"# accuracy score on the training data\n",
"# accuracy_score takes (y_true, y_pred): the ground-truth labels go first\n",
"X_train_prediction = classifier.predict(X_train)\n",
"training_data_accuracy = accuracy_score(Y_train, X_train_prediction)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mmJ22qhVvNwj",
"outputId": "1b1c3d32-b9f2-40c0-89ed-5d59b674cdfe"
},
"source": [
"# report the accuracy obtained on the training split\n",
"print('Accuracy score of the training data : ', training_data_accuracy)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy score of the training data : 0.7833876221498371\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "G2CICFMEvcCl"
},
"source": [
"# accuracy score on the test data\n",
"# accuracy_score takes (y_true, y_pred): the ground-truth labels go first\n",
"X_test_prediction = classifier.predict(X_test)\n",
"test_data_accuracy = accuracy_score(Y_test, X_test_prediction)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i2GcW_t_vz7C",
"outputId": "a65c3281-1621-4c8f-b57f-bbf0bc81d129"
},
"source": [
"# report the accuracy obtained on the held-out test split\n",
"print('Accuracy score of the test data : ', test_data_accuracy)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy score of the test data : 0.7727272727272727\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gq8ZX1xpwPF5"
},
"source": [
"Making a Predictive System"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "U-ULRe4yv5tH",
"outputId": "63b3fd00-f094-4642-b45e-3eb21331c3df"
},
"source": [
"# one instance to classify; values must follow the order of the feature columns in X\n",
"input_data = (5,166,72,19,175,25.8,0.587,51)\n",
"\n",
"# wrap the single instance in a one-row DataFrame carrying the training column\n",
"# names; predicting on a bare numpy array makes scikit-learn warn that\n",
"# 'X does not have valid feature names, but SVC was fitted with feature names'\n",
"input_data_frame = pd.DataFrame([input_data], columns=X.columns)\n",
"\n",
"prediction = classifier.predict(input_data_frame)\n",
"print(prediction)\n",
"\n",
"# label 0 means non-diabetic, label 1 means diabetic\n",
"if prediction[0] == 0:\n",
"    print('The person is not diabetic')\n",
"else:\n",
"    print('The person is diabetic')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[1]\n",
"The person is diabetic\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/sklearn/base.py:446: UserWarning: X does not have valid feature names, but SVC was fitted with feature names\n",
" \"X does not have valid feature names, but\"\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vgL6wblpQUtX"
},
"source": [
"Saving the trained model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Nn60MdxByjgz"
},
"source": [
"import pickle"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "cWzPQs4mQZN_"
},
"source": [
"# persist the fitted classifier to disk; the with-block closes the file handle\n",
"# deterministically, even if pickling raises\n",
"filename = 'trained_model.sav'\n",
"with open(filename, 'wb') as model_file:\n",
"    pickle.dump(classifier, model_file)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Wk1T2sMcQ6_U"
},
"source": [
"# loading the saved model\n",
"# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust\n",
"with open('trained_model.sav', 'rb') as model_file:\n",
"    loaded_model = pickle.load(model_file)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Bd5OpxHnRPyy",
"outputId": "abd39207-0fea-4c68-e91b-710244c8e73d"
},
"source": [
"# one instance to classify; values must follow the order of the columns the model was fitted on\n",
"input_data = (5,166,72,19,175,25.8,0.587,51)\n",
"\n",
"# build a one-row DataFrame labelled with the feature names stored on the fitted\n",
"# estimator, so the reloaded model needs nothing from the training session and\n",
"# scikit-learn does not warn that 'X does not have valid feature names'\n",
"input_data_frame = pd.DataFrame([input_data], columns=loaded_model.feature_names_in_)\n",
"\n",
"prediction = loaded_model.predict(input_data_frame)\n",
"print(prediction)\n",
"\n",
"# label 0 means non-diabetic, label 1 means diabetic\n",
"if prediction[0] == 0:\n",
"    print('The person is not diabetic')\n",
"else:\n",
"    print('The person is diabetic')"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[1]\n",
"The person is diabetic\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/sklearn/base.py:446: UserWarning: X does not have valid feature names, but SVC was fitted with feature names\n",
" \"X does not have valid feature names, but\"\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "iGRhGvgfRkvm"
},
"source": [],
"execution_count": null,
"outputs": []
}
]
}