Spaces:
Runtime error
Runtime error
panchajanya1999
commited on
Add initial application files
Browse files
Signed-off-by: Panchajanya1999 <[email protected]>
- app.py +4 -0
- dataset/spam.tsv +0 -0
- spam_classifier.ipynb +1012 -0
- spam_classifier.py +80 -0
app.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# this is the python file to run the spam_classifer.py file
|
2 |
+
|
3 |
+
with open('spam_classifier.py') as f:
|
4 |
+
exec(f.read())
|
dataset/spam.tsv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
spam_classifier.ipynb
ADDED
@@ -0,0 +1,1012 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"attachments": {},
|
5 |
+
"cell_type": "markdown",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Spam Classifier\n",
|
9 |
+
"\n",
|
10 |
+
"### Classification of SPAM or HAM using standard classifiers"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 1,
|
16 |
+
"metadata": {},
|
17 |
+
"outputs": [],
|
18 |
+
"source": [
|
19 |
+
"# import libraries\n",
|
20 |
+
"import pandas as pd\n",
|
21 |
+
"import numpy as np\n",
|
22 |
+
"import matplotlib.pyplot as plt\n",
|
23 |
+
"\n",
|
24 |
+
"# import string\n",
|
25 |
+
"import string\n",
|
26 |
+
"\n",
|
27 |
+
"# import countvectorizer\n",
|
28 |
+
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
29 |
+
"\n",
|
30 |
+
"# import train_test_split\n",
|
31 |
+
"from sklearn.model_selection import train_test_split\n",
|
32 |
+
"\n",
|
33 |
+
"# import a bunch of classifiers and compare their accuracy with the train and test data\n",
|
34 |
+
"from sklearn.naive_bayes import MultinomialNB\n",
|
35 |
+
"from sklearn.linear_model import LogisticRegression\n",
|
36 |
+
"from sklearn.ensemble import RandomForestClassifier\n",
|
37 |
+
"from sklearn.svm import SVC\n",
|
38 |
+
"from sklearn.neighbors import KNeighborsClassifier\n",
|
39 |
+
"from sklearn.tree import DecisionTreeClassifier\n",
|
40 |
+
"from sklearn.ensemble import AdaBoostClassifier\n",
|
41 |
+
"from sklearn.ensemble import GradientBoostingClassifier\n",
|
42 |
+
"from sklearn.ensemble import ExtraTreesClassifier\n",
|
43 |
+
"from sklearn.naive_bayes import GaussianNB\n",
|
44 |
+
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
|
45 |
+
"from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n",
|
46 |
+
"from sklearn.linear_model import SGDClassifier\n",
|
47 |
+
"from sklearn.linear_model import Perceptron\n",
|
48 |
+
"from sklearn.linear_model import PassiveAggressiveClassifier\n",
|
49 |
+
"from sklearn.neural_network import MLPClassifier\n",
|
50 |
+
"from sklearn.gaussian_process import GaussianProcessClassifier\n",
|
51 |
+
"\n",
|
52 |
+
"# import accuracy score\n",
|
53 |
+
"from sklearn.metrics import accuracy_score\n",
|
54 |
+
"\n",
|
55 |
+
"# import confusion matrix\n",
|
56 |
+
"from sklearn.metrics import confusion_matrix"
|
57 |
+
]
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"cell_type": "code",
|
61 |
+
"execution_count": 2,
|
62 |
+
"metadata": {},
|
63 |
+
"outputs": [
|
64 |
+
{
|
65 |
+
"data": {
|
66 |
+
"text/html": [
|
67 |
+
"<div>\n",
|
68 |
+
"<style scoped>\n",
|
69 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
70 |
+
" vertical-align: middle;\n",
|
71 |
+
" }\n",
|
72 |
+
"\n",
|
73 |
+
" .dataframe tbody tr th {\n",
|
74 |
+
" vertical-align: top;\n",
|
75 |
+
" }\n",
|
76 |
+
"\n",
|
77 |
+
" .dataframe thead th {\n",
|
78 |
+
" text-align: right;\n",
|
79 |
+
" }\n",
|
80 |
+
"</style>\n",
|
81 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
82 |
+
" <thead>\n",
|
83 |
+
" <tr style=\"text-align: right;\">\n",
|
84 |
+
" <th></th>\n",
|
85 |
+
" <th>label</th>\n",
|
86 |
+
" <th>message</th>\n",
|
87 |
+
" </tr>\n",
|
88 |
+
" </thead>\n",
|
89 |
+
" <tbody>\n",
|
90 |
+
" <tr>\n",
|
91 |
+
" <th>0</th>\n",
|
92 |
+
" <td>ham</td>\n",
|
93 |
+
" <td>I've been searching for the right words to tha...</td>\n",
|
94 |
+
" </tr>\n",
|
95 |
+
" <tr>\n",
|
96 |
+
" <th>1</th>\n",
|
97 |
+
" <td>spam</td>\n",
|
98 |
+
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
|
99 |
+
" </tr>\n",
|
100 |
+
" <tr>\n",
|
101 |
+
" <th>2</th>\n",
|
102 |
+
" <td>ham</td>\n",
|
103 |
+
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
|
104 |
+
" </tr>\n",
|
105 |
+
" <tr>\n",
|
106 |
+
" <th>3</th>\n",
|
107 |
+
" <td>ham</td>\n",
|
108 |
+
" <td>Even my brother is not like to speak with me. ...</td>\n",
|
109 |
+
" </tr>\n",
|
110 |
+
" <tr>\n",
|
111 |
+
" <th>4</th>\n",
|
112 |
+
" <td>ham</td>\n",
|
113 |
+
" <td>I HAVE A DATE ON SUNDAY WITH WILL!!!</td>\n",
|
114 |
+
" </tr>\n",
|
115 |
+
" </tbody>\n",
|
116 |
+
"</table>\n",
|
117 |
+
"</div>"
|
118 |
+
],
|
119 |
+
"text/plain": [
|
120 |
+
" label message\n",
|
121 |
+
"0 ham I've been searching for the right words to tha...\n",
|
122 |
+
"1 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
|
123 |
+
"2 ham Nah I don't think he goes to usf, he lives aro...\n",
|
124 |
+
"3 ham Even my brother is not like to speak with me. ...\n",
|
125 |
+
"4 ham I HAVE A DATE ON SUNDAY WITH WILL!!!"
|
126 |
+
]
|
127 |
+
},
|
128 |
+
"execution_count": 2,
|
129 |
+
"metadata": {},
|
130 |
+
"output_type": "execute_result"
|
131 |
+
}
|
132 |
+
],
|
133 |
+
"source": [
|
134 |
+
"df = pd.read_csv('spam.tsv', sep='\\t', names=['label', 'message'])\n",
|
135 |
+
"df.head()"
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 5,
|
141 |
+
"metadata": {},
|
142 |
+
"outputs": [
|
143 |
+
{
|
144 |
+
"name": "stdout",
|
145 |
+
"output_type": "stream",
|
146 |
+
"text": [
|
147 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
148 |
+
"RangeIndex: 5567 entries, 0 to 5566\n",
|
149 |
+
"Data columns (total 2 columns):\n",
|
150 |
+
" # Column Non-Null Count Dtype \n",
|
151 |
+
"--- ------ -------------- ----- \n",
|
152 |
+
" 0 label 5567 non-null object\n",
|
153 |
+
" 1 message 5567 non-null object\n",
|
154 |
+
"dtypes: object(2)\n",
|
155 |
+
"memory usage: 87.1+ KB\n"
|
156 |
+
]
|
157 |
+
}
|
158 |
+
],
|
159 |
+
"source": [
|
160 |
+
"# check info of the dataset\n",
|
161 |
+
"df.info()"
|
162 |
+
]
|
163 |
+
},
|
164 |
+
{
|
165 |
+
"cell_type": "code",
|
166 |
+
"execution_count": 6,
|
167 |
+
"metadata": {},
|
168 |
+
"outputs": [
|
169 |
+
{
|
170 |
+
"data": {
|
171 |
+
"text/html": [
|
172 |
+
"<div>\n",
|
173 |
+
"<style scoped>\n",
|
174 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
175 |
+
" vertical-align: middle;\n",
|
176 |
+
" }\n",
|
177 |
+
"\n",
|
178 |
+
" .dataframe tbody tr th {\n",
|
179 |
+
" vertical-align: top;\n",
|
180 |
+
" }\n",
|
181 |
+
"\n",
|
182 |
+
" .dataframe thead th {\n",
|
183 |
+
" text-align: right;\n",
|
184 |
+
" }\n",
|
185 |
+
"</style>\n",
|
186 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
187 |
+
" <thead>\n",
|
188 |
+
" <tr style=\"text-align: right;\">\n",
|
189 |
+
" <th></th>\n",
|
190 |
+
" <th>label</th>\n",
|
191 |
+
" <th>message</th>\n",
|
192 |
+
" <th>length</th>\n",
|
193 |
+
" </tr>\n",
|
194 |
+
" </thead>\n",
|
195 |
+
" <tbody>\n",
|
196 |
+
" <tr>\n",
|
197 |
+
" <th>0</th>\n",
|
198 |
+
" <td>ham</td>\n",
|
199 |
+
" <td>I've been searching for the right words to tha...</td>\n",
|
200 |
+
" <td>196</td>\n",
|
201 |
+
" </tr>\n",
|
202 |
+
" <tr>\n",
|
203 |
+
" <th>1</th>\n",
|
204 |
+
" <td>spam</td>\n",
|
205 |
+
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
|
206 |
+
" <td>155</td>\n",
|
207 |
+
" </tr>\n",
|
208 |
+
" <tr>\n",
|
209 |
+
" <th>2</th>\n",
|
210 |
+
" <td>ham</td>\n",
|
211 |
+
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
|
212 |
+
" <td>61</td>\n",
|
213 |
+
" </tr>\n",
|
214 |
+
" <tr>\n",
|
215 |
+
" <th>3</th>\n",
|
216 |
+
" <td>ham</td>\n",
|
217 |
+
" <td>Even my brother is not like to speak with me. ...</td>\n",
|
218 |
+
" <td>77</td>\n",
|
219 |
+
" </tr>\n",
|
220 |
+
" <tr>\n",
|
221 |
+
" <th>4</th>\n",
|
222 |
+
" <td>ham</td>\n",
|
223 |
+
" <td>I HAVE A DATE ON SUNDAY WITH WILL!!!</td>\n",
|
224 |
+
" <td>36</td>\n",
|
225 |
+
" </tr>\n",
|
226 |
+
" <tr>\n",
|
227 |
+
" <th>5</th>\n",
|
228 |
+
" <td>ham</td>\n",
|
229 |
+
" <td>As per your request 'Melle Melle (Oru Minnamin...</td>\n",
|
230 |
+
" <td>160</td>\n",
|
231 |
+
" </tr>\n",
|
232 |
+
" <tr>\n",
|
233 |
+
" <th>6</th>\n",
|
234 |
+
" <td>spam</td>\n",
|
235 |
+
" <td>WINNER!! As a valued network customer you have...</td>\n",
|
236 |
+
" <td>157</td>\n",
|
237 |
+
" </tr>\n",
|
238 |
+
" <tr>\n",
|
239 |
+
" <th>7</th>\n",
|
240 |
+
" <td>spam</td>\n",
|
241 |
+
" <td>Had your mobile 11 months or more? U R entitle...</td>\n",
|
242 |
+
" <td>154</td>\n",
|
243 |
+
" </tr>\n",
|
244 |
+
" <tr>\n",
|
245 |
+
" <th>8</th>\n",
|
246 |
+
" <td>ham</td>\n",
|
247 |
+
" <td>I'm gonna be home soon and i don't want to tal...</td>\n",
|
248 |
+
" <td>109</td>\n",
|
249 |
+
" </tr>\n",
|
250 |
+
" <tr>\n",
|
251 |
+
" <th>9</th>\n",
|
252 |
+
" <td>spam</td>\n",
|
253 |
+
" <td>SIX chances to win CASH! From 100 to 20,000 po...</td>\n",
|
254 |
+
" <td>136</td>\n",
|
255 |
+
" </tr>\n",
|
256 |
+
" </tbody>\n",
|
257 |
+
"</table>\n",
|
258 |
+
"</div>"
|
259 |
+
],
|
260 |
+
"text/plain": [
|
261 |
+
" label message length\n",
|
262 |
+
"0 ham I've been searching for the right words to tha... 196\n",
|
263 |
+
"1 spam Free entry in 2 a wkly comp to win FA Cup fina... 155\n",
|
264 |
+
"2 ham Nah I don't think he goes to usf, he lives aro... 61\n",
|
265 |
+
"3 ham Even my brother is not like to speak with me. ... 77\n",
|
266 |
+
"4 ham I HAVE A DATE ON SUNDAY WITH WILL!!! 36\n",
|
267 |
+
"5 ham As per your request 'Melle Melle (Oru Minnamin... 160\n",
|
268 |
+
"6 spam WINNER!! As a valued network customer you have... 157\n",
|
269 |
+
"7 spam Had your mobile 11 months or more? U R entitle... 154\n",
|
270 |
+
"8 ham I'm gonna be home soon and i don't want to tal... 109\n",
|
271 |
+
"9 spam SIX chances to win CASH! From 100 to 20,000 po... 136"
|
272 |
+
]
|
273 |
+
},
|
274 |
+
"execution_count": 6,
|
275 |
+
"metadata": {},
|
276 |
+
"output_type": "execute_result"
|
277 |
+
}
|
278 |
+
],
|
279 |
+
"source": [
|
280 |
+
"# add a new column to the dataset to count the length of the message\n",
|
281 |
+
"df['length'] = df['message'].apply(len)\n",
|
282 |
+
"df.head(10)"
|
283 |
+
]
|
284 |
+
},
|
285 |
+
{
|
286 |
+
"cell_type": "code",
|
287 |
+
"execution_count": 7,
|
288 |
+
"metadata": {},
|
289 |
+
"outputs": [
|
290 |
+
{
|
291 |
+
"data": {
|
292 |
+
"text/html": [
|
293 |
+
"<div>\n",
|
294 |
+
"<style scoped>\n",
|
295 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
296 |
+
" vertical-align: middle;\n",
|
297 |
+
" }\n",
|
298 |
+
"\n",
|
299 |
+
" .dataframe tbody tr th {\n",
|
300 |
+
" vertical-align: top;\n",
|
301 |
+
" }\n",
|
302 |
+
"\n",
|
303 |
+
" .dataframe thead th {\n",
|
304 |
+
" text-align: right;\n",
|
305 |
+
" }\n",
|
306 |
+
"</style>\n",
|
307 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
308 |
+
" <thead>\n",
|
309 |
+
" <tr style=\"text-align: right;\">\n",
|
310 |
+
" <th></th>\n",
|
311 |
+
" <th>length</th>\n",
|
312 |
+
" </tr>\n",
|
313 |
+
" <tr>\n",
|
314 |
+
" <th>label</th>\n",
|
315 |
+
" <th></th>\n",
|
316 |
+
" </tr>\n",
|
317 |
+
" </thead>\n",
|
318 |
+
" <tbody>\n",
|
319 |
+
" <tr>\n",
|
320 |
+
" <th>ham</th>\n",
|
321 |
+
" <td>71.442854</td>\n",
|
322 |
+
" </tr>\n",
|
323 |
+
" <tr>\n",
|
324 |
+
" <th>spam</th>\n",
|
325 |
+
" <td>138.659517</td>\n",
|
326 |
+
" </tr>\n",
|
327 |
+
" </tbody>\n",
|
328 |
+
"</table>\n",
|
329 |
+
"</div>"
|
330 |
+
],
|
331 |
+
"text/plain": [
|
332 |
+
" length\n",
|
333 |
+
"label \n",
|
334 |
+
"ham 71.442854\n",
|
335 |
+
"spam 138.659517"
|
336 |
+
]
|
337 |
+
},
|
338 |
+
"execution_count": 7,
|
339 |
+
"metadata": {},
|
340 |
+
"output_type": "execute_result"
|
341 |
+
}
|
342 |
+
],
|
343 |
+
"source": [
|
344 |
+
"# check the mean length of message of ham and spam messages\n",
|
345 |
+
"df.groupby('label').mean()"
|
346 |
+
]
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"attachments": {},
|
350 |
+
"cell_type": "markdown",
|
351 |
+
"metadata": {},
|
352 |
+
"source": [
|
353 |
+
"Seems like spam messages are generally longer than ham, so we can use the length of the message as a feature."
|
354 |
+
]
|
355 |
+
},
|
356 |
+
{
|
357 |
+
"cell_type": "code",
|
358 |
+
"execution_count": 8,
|
359 |
+
"metadata": {},
|
360 |
+
"outputs": [
|
361 |
+
{
|
362 |
+
"name": "stdout",
|
363 |
+
"output_type": "stream",
|
364 |
+
"text": [
|
365 |
+
"spam\n",
|
366 |
+
"count 746.000000\n",
|
367 |
+
"mean 138.659517\n",
|
368 |
+
"std 28.891361\n",
|
369 |
+
"min 13.000000\n",
|
370 |
+
"25% 133.000000\n",
|
371 |
+
"50% 149.000000\n",
|
372 |
+
"75% 157.000000\n",
|
373 |
+
"max 223.000000\n",
|
374 |
+
"Name: length, dtype: float64\n",
|
375 |
+
"\n",
|
376 |
+
"ham\n",
|
377 |
+
"count 4821.000000\n",
|
378 |
+
"mean 71.442854\n",
|
379 |
+
"std 58.373866\n",
|
380 |
+
"min 2.000000\n",
|
381 |
+
"25% 33.000000\n",
|
382 |
+
"50% 52.000000\n",
|
383 |
+
"75% 93.000000\n",
|
384 |
+
"max 910.000000\n",
|
385 |
+
"Name: length, dtype: float64\n",
|
386 |
+
"\n"
|
387 |
+
]
|
388 |
+
}
|
389 |
+
],
|
390 |
+
"source": [
|
391 |
+
"features = ['spam', 'ham']\n",
|
392 |
+
"\n",
|
393 |
+
"# check the description of the messages for each feature\n",
|
394 |
+
"for i in features:\n",
|
395 |
+
" print(i)\n",
|
396 |
+
" print(df[df['label'] == i]['length'].describe())\n",
|
397 |
+
" print()"
|
398 |
+
]
|
399 |
+
},
|
400 |
+
{
|
401 |
+
"attachments": {},
|
402 |
+
"cell_type": "markdown",
|
403 |
+
"metadata": {},
|
404 |
+
"source": [
|
405 |
+
"## Data Preprocessing"
|
406 |
+
]
|
407 |
+
},
|
408 |
+
{
|
409 |
+
"attachments": {},
|
410 |
+
"cell_type": "markdown",
|
411 |
+
"metadata": {},
|
412 |
+
"source": [
|
413 |
+
"We will first remove the punctuations from the messages and then tokenize them. We will then remove the stopwords and then stem the words. We will then create a bag of words model."
|
414 |
+
]
|
415 |
+
},
|
416 |
+
{
|
417 |
+
"cell_type": "code",
|
418 |
+
"execution_count": 9,
|
419 |
+
"metadata": {},
|
420 |
+
"outputs": [],
|
421 |
+
"source": [
|
422 |
+
"# write a function to remove punctuations from messages\n",
|
423 |
+
"def remove_punctuation(text):\n",
|
424 |
+
" no_punct = [char for char in text if char not in string.punctuation]\n",
|
425 |
+
" no_punct = ''.join(no_punct)\n",
|
426 |
+
" return no_punct"
|
427 |
+
]
|
428 |
+
},
|
429 |
+
{
|
430 |
+
"cell_type": "code",
|
431 |
+
"execution_count": 10,
|
432 |
+
"metadata": {},
|
433 |
+
"outputs": [],
|
434 |
+
"source": [
|
435 |
+
"# apply the function to the message column\n",
|
436 |
+
"df['message'] = df['message'].apply(remove_punctuation)"
|
437 |
+
]
|
438 |
+
},
|
439 |
+
{
|
440 |
+
"cell_type": "code",
|
441 |
+
"execution_count": 11,
|
442 |
+
"metadata": {},
|
443 |
+
"outputs": [],
|
444 |
+
"source": [
|
445 |
+
"# after removing punctuations, check the length of the message and also description of the message\n",
|
446 |
+
"df['length'] = df['message'].apply(len)"
|
447 |
+
]
|
448 |
+
},
|
449 |
+
{
|
450 |
+
"cell_type": "code",
|
451 |
+
"execution_count": 12,
|
452 |
+
"metadata": {},
|
453 |
+
"outputs": [
|
454 |
+
{
|
455 |
+
"name": "stdout",
|
456 |
+
"output_type": "stream",
|
457 |
+
"text": [
|
458 |
+
"spam\n",
|
459 |
+
"count 746.000000\n",
|
460 |
+
"mean 132.950402\n",
|
461 |
+
"std 27.847503\n",
|
462 |
+
"min 12.000000\n",
|
463 |
+
"25% 127.000000\n",
|
464 |
+
"50% 143.000000\n",
|
465 |
+
"75% 151.000000\n",
|
466 |
+
"max 207.000000\n",
|
467 |
+
"Name: length, dtype: float64\n",
|
468 |
+
"\n",
|
469 |
+
"ham\n",
|
470 |
+
"count 4821.000000\n",
|
471 |
+
"mean 67.506741\n",
|
472 |
+
"std 55.333532\n",
|
473 |
+
"min 1.000000\n",
|
474 |
+
"25% 31.000000\n",
|
475 |
+
"50% 50.000000\n",
|
476 |
+
"75% 88.000000\n",
|
477 |
+
"max 888.000000\n",
|
478 |
+
"Name: length, dtype: float64\n",
|
479 |
+
"\n"
|
480 |
+
]
|
481 |
+
}
|
482 |
+
],
|
483 |
+
"source": [
|
484 |
+
"# check the description of the labels\n",
|
485 |
+
"for i in features:\n",
|
486 |
+
" print(i)\n",
|
487 |
+
" print(df[df['label'] == i]['length'].describe())\n",
|
488 |
+
" print()\n"
|
489 |
+
]
|
490 |
+
},
|
491 |
+
{
|
492 |
+
"attachments": {},
|
493 |
+
"cell_type": "markdown",
|
494 |
+
"metadata": {},
|
495 |
+
"source": [
|
496 |
+
"We need to convert the messages into a vector format. We will use the CountVectorizer class from the sklearn library. We will pass stop_words='english' to remove the stopwords.\n",
|
497 |
+
"\n",
|
498 |
+
"Setting stop_words='english' will exclude a predefined list of English language words that are considered to be stop words. Stop words are words that are commonly used in natural language but typically do not carry significant meaning or context. Examples of stop words in English include \"the\", \"and\", \"a\", \"in\", and \"of\". By excluding these words from the token count, the resulting matrix will be less sparse and more meaningful, as the focus will be on the words that carry more significance and context."
|
499 |
+
]
|
500 |
+
},
|
501 |
+
{
|
502 |
+
"cell_type": "code",
|
503 |
+
"execution_count": 13,
|
504 |
+
"metadata": {},
|
505 |
+
"outputs": [],
|
506 |
+
"source": [
|
507 |
+
"CV = CountVectorizer(stop_words='english')"
|
508 |
+
]
|
509 |
+
},
|
510 |
+
{
|
511 |
+
"cell_type": "code",
|
512 |
+
"execution_count": 14,
|
513 |
+
"metadata": {},
|
514 |
+
"outputs": [],
|
515 |
+
"source": [
|
516 |
+
"# assign the contents of each 'message' to X and 'label' to y\n",
|
517 |
+
"X = df['message'].values\n",
|
518 |
+
"y = df['label'].values"
|
519 |
+
]
|
520 |
+
},
|
521 |
+
{
|
522 |
+
"cell_type": "code",
|
523 |
+
"execution_count": 15,
|
524 |
+
"metadata": {},
|
525 |
+
"outputs": [],
|
526 |
+
"source": [
|
527 |
+
"# split the dataset into train and test\n",
|
528 |
+
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
529 |
+
"\n",
|
530 |
+
"# fit the countvectorizer transformer to the training data\n",
|
531 |
+
"X_train_CV = CV.fit_transform(X_train)\n",
|
532 |
+
"\n",
|
533 |
+
"# fit the countvectorizer transformer to the testing data\n",
|
534 |
+
"X_test_CV = CV.transform(X_test)\n"
|
535 |
+
]
|
536 |
+
},
|
537 |
+
{
|
538 |
+
"cell_type": "code",
|
539 |
+
"execution_count": 16,
|
540 |
+
"metadata": {},
|
541 |
+
"outputs": [
|
542 |
+
{
|
543 |
+
"name": "stdout",
|
544 |
+
"output_type": "stream",
|
545 |
+
"text": [
|
546 |
+
"Accuracy of MultinomialNB() is: 0.9892280071813285\n",
|
547 |
+
"Confusion Matrix of MultinomialNB() is: [[963 6]\n",
|
548 |
+
" [ 6 139]]\n",
|
549 |
+
"Accuracy of LogisticRegression() is: 0.9829443447037702\n",
|
550 |
+
"Confusion Matrix of LogisticRegression() is: [[969 0]\n",
|
551 |
+
" [ 19 126]]\n",
|
552 |
+
"Accuracy of RandomForestClassifier() is: 0.9820466786355476\n",
|
553 |
+
"Confusion Matrix of RandomForestClassifier() is: [[969 0]\n",
|
554 |
+
" [ 20 125]]\n",
|
555 |
+
"Accuracy of SVC() is: 0.981149012567325\n",
|
556 |
+
"Confusion Matrix of SVC() is: [[968 1]\n",
|
557 |
+
" [ 20 125]]\n",
|
558 |
+
"Accuracy of KNeighborsClassifier() is: 0.9156193895870736\n",
|
559 |
+
"Confusion Matrix of KNeighborsClassifier() is: [[969 0]\n",
|
560 |
+
" [ 94 51]]\n",
|
561 |
+
"Accuracy of DecisionTreeClassifier() is: 0.9685816876122083\n",
|
562 |
+
"Confusion Matrix of DecisionTreeClassifier() is: [[956 13]\n",
|
563 |
+
" [ 22 123]]\n",
|
564 |
+
"Accuracy of AdaBoostClassifier() is: 0.9631956912028725\n",
|
565 |
+
"Confusion Matrix of AdaBoostClassifier() is: [[959 10]\n",
|
566 |
+
" [ 31 114]]\n",
|
567 |
+
"Accuracy of GradientBoostingClassifier() is: 0.9631956912028725\n",
|
568 |
+
"Confusion Matrix of GradientBoostingClassifier() is: [[967 2]\n",
|
569 |
+
" [ 39 106]]\n",
|
570 |
+
"Accuracy of ExtraTreesClassifier() is: 0.9820466786355476\n",
|
571 |
+
"Confusion Matrix of ExtraTreesClassifier() is: [[967 2]\n",
|
572 |
+
" [ 18 127]]\n",
|
573 |
+
"Accuracy of SGDClassifier() is: 0.9847396768402155\n",
|
574 |
+
"Confusion Matrix of SGDClassifier() is: [[968 1]\n",
|
575 |
+
" [ 16 129]]\n",
|
576 |
+
"Accuracy of Perceptron() is: 0.9829443447037702\n",
|
577 |
+
"Confusion Matrix of Perceptron() is: [[958 11]\n",
|
578 |
+
" [ 8 137]]\n",
|
579 |
+
"Accuracy of PassiveAggressiveClassifier() is: 0.9847396768402155\n",
|
580 |
+
"Confusion Matrix of PassiveAggressiveClassifier() is: [[967 2]\n",
|
581 |
+
" [ 15 130]]\n",
|
582 |
+
"Accuracy of MLPClassifier() is: 0.9883303411131059\n",
|
583 |
+
"Confusion Matrix of MLPClassifier() is: [[968 1]\n",
|
584 |
+
" [ 12 133]]\n"
|
585 |
+
]
|
586 |
+
}
|
587 |
+
],
|
588 |
+
"source": [
|
589 |
+
"# create a list of classifiers\n",
|
590 |
+
"classifiers = [\n",
|
591 |
+
" MultinomialNB(),\n",
|
592 |
+
" LogisticRegression(),\n",
|
593 |
+
" RandomForestClassifier(),\n",
|
594 |
+
" SVC(),\n",
|
595 |
+
" KNeighborsClassifier(),\n",
|
596 |
+
" DecisionTreeClassifier(),\n",
|
597 |
+
" AdaBoostClassifier(),\n",
|
598 |
+
" GradientBoostingClassifier(),\n",
|
599 |
+
" ExtraTreesClassifier(),\n",
|
600 |
+
" SGDClassifier(),\n",
|
601 |
+
" Perceptron(),\n",
|
602 |
+
" PassiveAggressiveClassifier(),\n",
|
603 |
+
" MLPClassifier(),\n",
|
604 |
+
"]\n",
|
605 |
+
"\n",
|
606 |
+
"\n",
|
607 |
+
"# create a dataframe to store the accuracy of each classifier\n",
|
608 |
+
"df_acc = pd.DataFrame(columns=['classifier', 'accuracy'])\n",
|
609 |
+
"\n",
|
610 |
+
"# create a dataframe to store the confusion matrix of each classifier\n",
|
611 |
+
"df_cm = pd.DataFrame(columns=['classifier', 'confusion_matrix'])\n",
|
612 |
+
"\n",
|
613 |
+
"# create a function to train the model and store the accuracy, confusion matrix, classification report, f1 score, precision score, recall score, roc_auc score, jaccard score and log loss in the dataframe\n",
|
614 |
+
"\n",
|
615 |
+
"def train_model(classifier):\n",
|
616 |
+
" clf = classifier\n",
|
617 |
+
" clf.fit(X_train_CV, y_train)\n",
|
618 |
+
" y_pred = clf.predict(X_test_CV)\n",
|
619 |
+
" acc = accuracy_score(y_test, y_pred)\n",
|
620 |
+
" cm = confusion_matrix(y_test, y_pred)\n",
|
621 |
+
" print('Accuracy of ' + str(classifier) + ' is: ' + str(acc))\n",
|
622 |
+
" print('Confusion Matrix of ' + str(classifier) + ' is: ' + str(cm))\n",
|
623 |
+
"\n",
|
624 |
+
"# train the model\n",
|
625 |
+
"for classifier in classifiers:\n",
|
626 |
+
" train_model(classifier)\n"
|
627 |
+
]
|
628 |
+
},
|
629 |
+
{
|
630 |
+
"attachments": {},
|
631 |
+
"cell_type": "markdown",
|
632 |
+
"metadata": {},
|
633 |
+
"source": [
|
634 |
+
"We choose the MultinomialNB since it is the one with highest accuracy."
|
635 |
+
]
|
636 |
+
},
|
637 |
+
{
|
638 |
+
"cell_type": "code",
|
639 |
+
"execution_count": 17,
|
640 |
+
"metadata": {},
|
641 |
+
"outputs": [
|
642 |
+
{
|
643 |
+
"data": {
|
644 |
+
"text/html": [
|
645 |
+
"<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MultinomialNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">MultinomialNB</label><div class=\"sk-toggleable__content\"><pre>MultinomialNB()</pre></div></div></div></div></div>"
|
646 |
+
],
|
647 |
+
"text/plain": [
|
648 |
+
"MultinomialNB()"
|
649 |
+
]
|
650 |
+
},
|
651 |
+
"execution_count": 17,
|
652 |
+
"metadata": {},
|
653 |
+
"output_type": "execute_result"
|
654 |
+
}
|
655 |
+
],
|
656 |
+
"source": [
|
657 |
+
"# create an instance of the classifier\n",
|
658 |
+
"NB = MultinomialNB()\n",
|
659 |
+
"\n",
|
660 |
+
"# fit the classifier to the training data\n",
|
661 |
+
"NB.fit(X_train_CV, y_train)"
|
662 |
+
]
|
663 |
+
},
|
664 |
+
{
|
665 |
+
"cell_type": "code",
|
666 |
+
"execution_count": 18,
|
667 |
+
"metadata": {},
|
668 |
+
"outputs": [
|
669 |
+
{
|
670 |
+
"data": {
|
671 |
+
"text/plain": [
|
672 |
+
"98.92280071813285"
|
673 |
+
]
|
674 |
+
},
|
675 |
+
"execution_count": 18,
|
676 |
+
"metadata": {},
|
677 |
+
"output_type": "execute_result"
|
678 |
+
}
|
679 |
+
],
|
680 |
+
"source": [
|
681 |
+
"# test the accuracy with test data\n",
|
682 |
+
"y_pred = NB.predict(X_test_CV)\n",
|
683 |
+
"\n",
|
684 |
+
"# check the accuracy\n",
|
685 |
+
"accuracy_score(y_test, y_pred)*100"
|
686 |
+
]
|
687 |
+
},
|
688 |
+
{
|
689 |
+
"cell_type": "code",
|
690 |
+
"execution_count": 72,
|
691 |
+
"metadata": {},
|
692 |
+
"outputs": [],
|
693 |
+
"source": [
|
694 |
+
"# write a function that will accept user input and will predict if it is a spam or ham. Based on the prediction it will save the result and message in a csv file\n",
|
695 |
+
"def predict_message(message):\n",
|
696 |
+
" message = remove_punctuation(message)\n",
|
697 |
+
" tmessage = CV.transform([message])\n",
|
698 |
+
" prediction = NB.predict(tmessage)\n",
|
699 |
+
" print(prediction)\n",
|
700 |
+
" if prediction == 'spam':\n",
|
701 |
+
" df = pd.read_csv('results/results.tsv', sep='\\t', names = ['label', 'message'])\n",
|
702 |
+
" df = df.append({'label': 'spam', 'message': message}, ignore_index = True)\n",
|
703 |
+
" df.to_csv('results/results.tsv', index=False)\n",
|
704 |
+
" else:\n",
|
705 |
+
" df = pd.read_csv('results/results.tsv', sep='\\t', names = ['label', 'message'])\n",
|
706 |
+
" df = df.append({'label': 'ham', 'message': message}, ignore_index=True)\n",
|
707 |
+
" df.to_csv('results/results.tsv', index=False)\n",
|
708 |
+
" return prediction"
|
709 |
+
]
|
710 |
+
},
|
711 |
+
{
|
712 |
+
"cell_type": "code",
|
713 |
+
"execution_count": 73,
|
714 |
+
"metadata": {},
|
715 |
+
"outputs": [
|
716 |
+
{
|
717 |
+
"name": "stdout",
|
718 |
+
"output_type": "stream",
|
719 |
+
"text": [
|
720 |
+
"['ham']\n"
|
721 |
+
]
|
722 |
+
},
|
723 |
+
{
|
724 |
+
"name": "stderr",
|
725 |
+
"output_type": "stream",
|
726 |
+
"text": [
|
727 |
+
"/tmp/ipykernel_981768/2737347627.py:13: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
|
728 |
+
" df = df.append({'label': 'ham', 'message': message}, ignore_index=True)\n"
|
729 |
+
]
|
730 |
+
},
|
731 |
+
{
|
732 |
+
"data": {
|
733 |
+
"text/plain": [
|
734 |
+
"array(['ham'], dtype='<U4')"
|
735 |
+
]
|
736 |
+
},
|
737 |
+
"execution_count": 73,
|
738 |
+
"metadata": {},
|
739 |
+
"output_type": "execute_result"
|
740 |
+
}
|
741 |
+
],
|
742 |
+
"source": [
|
743 |
+
"predict_message('Good Morning Madam! Have a Nice Day!!')"
|
744 |
+
]
|
745 |
+
},
|
746 |
+
{
|
747 |
+
"cell_type": "code",
|
748 |
+
"execution_count": 2,
|
749 |
+
"metadata": {},
|
750 |
+
"outputs": [
|
751 |
+
{
|
752 |
+
"name": "stdout",
|
753 |
+
"output_type": "stream",
|
754 |
+
"text": [
|
755 |
+
"Defaulting to user installation because normal site-packages is not writeable\n",
|
756 |
+
"Collecting modin[all]\n",
|
757 |
+
" Downloading modin-0.19.0-py3-none-any.whl (1.0 MB)\n",
|
758 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m56.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
|
759 |
+
"\u001b[?25hRequirement already satisfied: numpy>=1.18.5 in /usr/lib/python3.10/site-packages (from modin[all]) (1.23.2)\n",
|
760 |
+
"Collecting fsspec\n",
|
761 |
+
" Downloading fsspec-2023.3.0-py3-none-any.whl (145 kB)\n",
|
762 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m145.4/145.4 kB\u001b[0m \u001b[31m93.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
763 |
+
"\u001b[?25hCollecting pandas==1.5.3\n",
|
764 |
+
" Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)\n",
|
765 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.1/12.1 MB\u001b[0m \u001b[31m413.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
766 |
+
"\u001b[?25hRequirement already satisfied: psutil in /home/panchajanya/.local/lib/python3.10/site-packages (from modin[all]) (5.9.1)\n",
|
767 |
+
"Requirement already satisfied: packaging in /usr/lib/python3.10/site-packages (from modin[all]) (21.3)\n",
|
768 |
+
"Collecting distributed>=2.22.0\n",
|
769 |
+
" Downloading distributed-2023.3.2-py3-none-any.whl (956 kB)\n",
|
770 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m956.9/956.9 kB\u001b[0m \u001b[31m106.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
771 |
+
"\u001b[?25hCollecting ray[default]>=1.13.0\n",
|
772 |
+
" Downloading ray-2.3.1-cp310-cp310-manylinux2014_x86_64.whl (58.5 MB)\n",
|
773 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.5/58.5 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
|
774 |
+
"\u001b[?25hCollecting unidist[mpi]>=0.2.1\n",
|
775 |
+
" Downloading unidist-0.3.0-py3-none-any.whl (104 kB)\n",
|
776 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m104.3/104.3 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
777 |
+
"\u001b[?25hCollecting cloudpickle\n",
|
778 |
+
" Downloading cloudpickle-2.2.1-py3-none-any.whl (25 kB)\n",
|
779 |
+
"Collecting boto3\n",
|
780 |
+
" Downloading boto3-1.26.101-py3-none-any.whl (135 kB)\n",
|
781 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m135.5/135.5 kB\u001b[0m \u001b[31m974.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
782 |
+
"\u001b[?25hCollecting dask>=2.22.0\n",
|
783 |
+
" Downloading dask-2023.3.2-py3-none-any.whl (1.2 MB)\n",
|
784 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m0m\n",
|
785 |
+
"\u001b[?25hCollecting pyarrow\n",
|
786 |
+
" Downloading pyarrow-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)\n",
|
787 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.9/34.9 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
788 |
+
"\u001b[?25hCollecting modin-spreadsheet>=0.1.0\n",
|
789 |
+
" Downloading modin_spreadsheet-0.1.2-py2.py3-none-any.whl (1.8 MB)\n",
|
790 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
791 |
+
"\u001b[?25hCollecting rpyc==4.1.5\n",
|
792 |
+
" Downloading rpyc-4.1.5-py3-none-any.whl (68 kB)\n",
|
793 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m68.9/68.9 kB\u001b[0m \u001b[31m430.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
|
794 |
+
"\u001b[?25hRequirement already satisfied: pytz>=2020.1 in /usr/lib/python3.10/site-packages (from pandas==1.5.3->modin[all]) (2022.7)\n",
|
795 |
+
"Requirement already satisfied: python-dateutil>=2.8.1 in /home/panchajanya/.local/lib/python3.10/site-packages (from pandas==1.5.3->modin[all]) (2.8.2)\n",
|
796 |
+
"Collecting plumbum\n",
|
797 |
+
" Downloading plumbum-1.8.1-py3-none-any.whl (126 kB)\n",
|
798 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.7/126.7 kB\u001b[0m \u001b[31m517.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
799 |
+
"\u001b[?25hRequirement already satisfied: pyyaml>=5.3.1 in /usr/lib/python3.10/site-packages (from dask>=2.22.0->modin[all]) (6.0)\n",
|
800 |
+
"Collecting toolz>=0.8.2\n",
|
801 |
+
" Downloading toolz-0.12.0-py3-none-any.whl (55 kB)\n",
|
802 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m186.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
803 |
+
"\u001b[?25hCollecting importlib-metadata>=4.13.0\n",
|
804 |
+
" Downloading importlib_metadata-6.1.0-py3-none-any.whl (21 kB)\n",
|
805 |
+
"Requirement already satisfied: click>=7.0 in /home/panchajanya/.local/lib/python3.10/site-packages (from dask>=2.22.0->modin[all]) (8.1.3)\n",
|
806 |
+
"Collecting partd>=1.2.0\n",
|
807 |
+
" Downloading partd-1.3.0-py3-none-any.whl (18 kB)\n",
|
808 |
+
"Collecting tblib>=1.6.0\n",
|
809 |
+
" Downloading tblib-1.7.0-py2.py3-none-any.whl (12 kB)\n",
|
810 |
+
"Requirement already satisfied: tornado>=6.0.3 in /home/panchajanya/.local/lib/python3.10/site-packages (from distributed>=2.22.0->modin[all]) (6.1)\n",
|
811 |
+
"Collecting zict>=2.1.0\n",
|
812 |
+
" Downloading zict-2.2.0-py2.py3-none-any.whl (23 kB)\n",
|
813 |
+
"Collecting sortedcontainers>=2.0.5\n",
|
814 |
+
" Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\n",
|
815 |
+
"Requirement already satisfied: urllib3>=1.24.3 in /usr/lib/python3.10/site-packages (from distributed>=2.22.0->modin[all]) (1.26.12)\n",
|
816 |
+
"Requirement already satisfied: jinja2>=2.10.3 in /usr/lib/python3.10/site-packages (from distributed>=2.22.0->modin[all]) (3.1.2)\n",
|
817 |
+
"Collecting locket>=1.0.0\n",
|
818 |
+
" Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)\n",
|
819 |
+
"Requirement already satisfied: msgpack>=1.0.0 in /usr/lib/python3.10/site-packages (from distributed>=2.22.0->modin[all]) (1.0.4)\n",
|
820 |
+
"Requirement already satisfied: jupyter>=1.0.0 in /usr/lib/python3.10/site-packages (from modin-spreadsheet>=0.1.0->modin[all]) (1.0.0)\n",
|
821 |
+
"Requirement already satisfied: ipywidgets>=7.0.0 in /usr/lib/python3.10/site-packages (from modin-spreadsheet>=0.1.0->modin[all]) (8.0.2)\n",
|
822 |
+
"Requirement already satisfied: notebook>=6.0.3 in /usr/lib/python3.10/site-packages (from modin-spreadsheet>=0.1.0->modin[all]) (6.4.12)\n",
|
823 |
+
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/lib/python3.10/site-packages (from packaging->modin[all]) (3.0.9)\n",
|
824 |
+
"Requirement already satisfied: jsonschema in /usr/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (4.15.0)\n",
|
825 |
+
"Requirement already satisfied: aiosignal in /home/panchajanya/.local/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (1.3.1)\n",
|
826 |
+
"Requirement already satisfied: protobuf!=3.19.5,>=3.15.3 in /usr/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (4.21.12)\n",
|
827 |
+
"Collecting grpcio>=1.42.0\n",
|
828 |
+
" Downloading grpcio-1.53.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.0 MB)\n",
|
829 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m266.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
830 |
+
"\u001b[?25hCollecting virtualenv>=20.0.24\n",
|
831 |
+
" Downloading virtualenv-20.21.0-py3-none-any.whl (8.7 MB)\n",
|
832 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m0m\n",
|
833 |
+
"\u001b[?25hRequirement already satisfied: attrs in /usr/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (22.2.0)\n",
|
834 |
+
"Requirement already satisfied: frozenlist in /home/panchajanya/.local/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (1.3.3)\n",
|
835 |
+
"Requirement already satisfied: requests in /usr/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (2.28.1)\n",
|
836 |
+
"Collecting filelock\n",
|
837 |
+
" Downloading filelock-3.10.7-py3-none-any.whl (10 kB)\n",
|
838 |
+
"Requirement already satisfied: pydantic in /usr/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (1.10.4)\n",
|
839 |
+
"Collecting smart-open\n",
|
840 |
+
" Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)\n",
|
841 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.8/56.8 kB\u001b[0m \u001b[31m581.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
|
842 |
+
"\u001b[?25hRequirement already satisfied: prometheus-client>=0.7.1 in /usr/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (0.14.1)\n",
|
843 |
+
"Requirement already satisfied: aiohttp>=3.7 in /home/panchajanya/.local/lib/python3.10/site-packages (from ray[default]>=1.13.0->modin[all]) (3.8.4)\n",
|
844 |
+
"Collecting aiohttp-cors\n",
|
845 |
+
" Downloading aiohttp_cors-0.7.0-py3-none-any.whl (27 kB)\n",
|
846 |
+
"Collecting gpustat>=1.0.0\n",
|
847 |
+
" Downloading gpustat-1.0.0.tar.gz (90 kB)\n",
|
848 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m90.5/90.5 kB\u001b[0m \u001b[31m897.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n",
|
849 |
+
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
850 |
+
"\u001b[?25hCollecting py-spy>=0.2.0\n",
|
851 |
+
" Downloading py_spy-0.3.14-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (3.0 MB)\n",
|
852 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
853 |
+
"\u001b[?25hCollecting opencensus\n",
|
854 |
+
" Downloading opencensus-0.11.2-py2.py3-none-any.whl (128 kB)\n",
|
855 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m128.2/128.2 kB\u001b[0m \u001b[31m672.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
856 |
+
"\u001b[?25hCollecting colorful\n",
|
857 |
+
" Downloading colorful-0.5.5-py2.py3-none-any.whl (201 kB)\n",
|
858 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m201.4/201.4 kB\u001b[0m \u001b[31m939.8 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
859 |
+
"\u001b[?25hCollecting mpi4py-mpich\n",
|
860 |
+
" Downloading mpi4py_mpich-3.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)\n",
|
861 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m0m\n",
|
862 |
+
"\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1\n",
|
863 |
+
" Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
|
864 |
+
"Collecting botocore<1.30.0,>=1.29.101\n",
|
865 |
+
" Downloading botocore-1.29.101-py3-none-any.whl (10.6 MB)\n",
|
866 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.6/10.6 MB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
|
867 |
+
"\u001b[?25hCollecting s3transfer<0.7.0,>=0.6.0\n",
|
868 |
+
" Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)\n",
|
869 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.6/79.6 kB\u001b[0m \u001b[31m211.3 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
870 |
+
"\u001b[?25hRequirement already satisfied: charset-normalizer<4.0,>=2.0 in /home/panchajanya/.local/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]>=1.13.0->modin[all]) (3.1.0)\n",
|
871 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /home/panchajanya/.local/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]>=1.13.0->modin[all]) (6.0.4)\n",
|
872 |
+
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/panchajanya/.local/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]>=1.13.0->modin[all]) (4.0.2)\n",
|
873 |
+
"Requirement already satisfied: yarl<2.0,>=1.0 in /home/panchajanya/.local/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]>=1.13.0->modin[all]) (1.8.2)\n",
|
874 |
+
"Requirement already satisfied: six>=1.7 in /usr/lib/python3.10/site-packages (from gpustat>=1.0.0->ray[default]>=1.13.0->modin[all]) (1.16.0)\n",
|
875 |
+
"Collecting nvidia-ml-py<=11.495.46,>=11.450.129\n",
|
876 |
+
" Downloading nvidia_ml_py-11.495.46-py3-none-any.whl (25 kB)\n",
|
877 |
+
"Collecting blessed>=1.17.1\n",
|
878 |
+
" Downloading blessed-1.20.0-py2.py3-none-any.whl (58 kB)\n",
|
879 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.4/58.4 kB\u001b[0m \u001b[31m153.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
880 |
+
"\u001b[?25hCollecting zipp>=0.5\n",
|
881 |
+
" Downloading zipp-3.15.0-py3-none-any.whl (6.8 kB)\n",
|
882 |
+
"Requirement already satisfied: ipykernel>=4.5.1 in /home/panchajanya/.local/lib/python3.10/site-packages (from ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (6.15.0)\n",
|
883 |
+
"Requirement already satisfied: traitlets>=4.3.1 in /usr/lib/python3.10/site-packages (from ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (5.9.0)\n",
|
884 |
+
"Requirement already satisfied: jupyterlab-widgets~=3.0 in /usr/lib/python3.10/site-packages (from ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (3.0.3)\n",
|
885 |
+
"Requirement already satisfied: ipython>=6.1.0 in /usr/lib/python3.10/site-packages (from ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (8.9.0)\n",
|
886 |
+
"Requirement already satisfied: widgetsnbextension~=4.0 in /usr/lib/python3.10/site-packages (from ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (4.0.3)\n",
|
887 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/lib/python3.10/site-packages (from jinja2>=2.10.3->distributed>=2.22.0->modin[all]) (2.1.1)\n",
|
888 |
+
"Requirement already satisfied: nbconvert in /usr/lib/python3.10/site-packages (from jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (7.0.0)\n",
|
889 |
+
"Requirement already satisfied: jupyter-console in /usr/lib/python3.10/site-packages (from jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (6.4.4)\n",
|
890 |
+
"Requirement already satisfied: qtconsole in /usr/lib/python3.10/site-packages (from jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (5.3.2)\n",
|
891 |
+
"Requirement already satisfied: argon2-cffi in /usr/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (21.3.0)\n",
|
892 |
+
"Requirement already satisfied: nbformat in /usr/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (5.4.0)\n",
|
893 |
+
"Requirement already satisfied: terminado>=0.8.3 in /usr/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (0.15.0)\n",
|
894 |
+
"Requirement already satisfied: jupyter-core>=4.6.1 in /home/panchajanya/.local/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (4.10.0)\n",
|
895 |
+
"Requirement already satisfied: pyzmq>=17 in /home/panchajanya/.local/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (23.1.0)\n",
|
896 |
+
"Requirement already satisfied: ipython-genutils in /usr/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (0.2.0)\n",
|
897 |
+
"Requirement already satisfied: nest-asyncio>=1.5 in /home/panchajanya/.local/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (1.5.5)\n",
|
898 |
+
"Requirement already satisfied: Send2Trash>=1.8.0 in /usr/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (1.8.0)\n",
|
899 |
+
"Requirement already satisfied: jupyter-client>=5.3.4 in /home/panchajanya/.local/lib/python3.10/site-packages (from notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (7.3.4)\n",
|
900 |
+
"Requirement already satisfied: distlib<1,>=0.3.6 in /usr/lib/python3.10/site-packages (from virtualenv>=20.0.24->ray[default]>=1.13.0->modin[all]) (0.3.6)\n",
|
901 |
+
"Requirement already satisfied: platformdirs<4,>=2.4 in /usr/lib/python3.10/site-packages (from virtualenv>=20.0.24->ray[default]>=1.13.0->modin[all]) (2.6.2)\n",
|
902 |
+
"Collecting heapdict\n",
|
903 |
+
" Downloading HeapDict-1.0.1-py3-none-any.whl (3.9 kB)\n",
|
904 |
+
"Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/lib/python3.10/site-packages (from jsonschema->ray[default]>=1.13.0->modin[all]) (0.18.1)\n",
|
905 |
+
"Collecting opencensus-context>=0.1.3\n",
|
906 |
+
" Downloading opencensus_context-0.1.3-py2.py3-none-any.whl (5.1 kB)\n",
|
907 |
+
"Requirement already satisfied: google-api-core<3.0.0,>=1.0.0 in /usr/lib/python3.10/site-packages (from opencensus->ray[default]>=1.13.0->modin[all]) (2.11.0)\n",
|
908 |
+
"Requirement already satisfied: typing-extensions>=4.2.0 in /usr/lib/python3.10/site-packages (from pydantic->ray[default]>=1.13.0->modin[all]) (4.4.0)\n",
|
909 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3.10/site-packages (from requests->ray[default]>=1.13.0->modin[all]) (3.4)\n",
|
910 |
+
"Requirement already satisfied: wcwidth>=0.1.4 in /usr/lib/python3.10/site-packages (from blessed>=1.17.1->gpustat>=1.0.0->ray[default]>=1.13.0->modin[all]) (0.2.5)\n",
|
911 |
+
"Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.56.2 in /usr/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]>=1.13.0->modin[all]) (1.58.0)\n",
|
912 |
+
"Requirement already satisfied: google-auth<3.0dev,>=2.14.1 in /usr/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]>=1.13.0->modin[all]) (2.16.0)\n",
|
913 |
+
"Requirement already satisfied: matplotlib-inline>=0.1 in /usr/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.1.6)\n",
|
914 |
+
"Requirement already satisfied: debugpy>=1.0 in /home/panchajanya/.local/lib/python3.10/site-packages (from ipykernel>=4.5.1->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (1.6.0)\n",
|
915 |
+
"Requirement already satisfied: stack-data in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.6.2)\n",
|
916 |
+
"Requirement already satisfied: decorator in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (5.1.1)\n",
|
917 |
+
"Requirement already satisfied: pickleshare in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.7.5)\n",
|
918 |
+
"Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.30 in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (3.0.36)\n",
|
919 |
+
"Requirement already satisfied: pygments>=2.4.0 in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (2.14.0)\n",
|
920 |
+
"Requirement already satisfied: pexpect>4.3 in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (4.8.0)\n",
|
921 |
+
"Requirement already satisfied: jedi>=0.16 in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.18.2)\n",
|
922 |
+
"Requirement already satisfied: backcall in /usr/lib/python3.10/site-packages (from ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.2.0)\n",
|
923 |
+
"Requirement already satisfied: entrypoints in /home/panchajanya/.local/lib/python3.10/site-packages (from jupyter-client>=5.3.4->notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (0.4)\n",
|
924 |
+
"Requirement already satisfied: mistune<3,>=2.0.3 in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (2.0.4)\n",
|
925 |
+
"Requirement already satisfied: defusedxml in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.7.1)\n",
|
926 |
+
"Requirement already satisfied: tinycss2 in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (1.1.1)\n",
|
927 |
+
"Requirement already satisfied: jupyterlab-pygments in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.2.2)\n",
|
928 |
+
"Requirement already satisfied: bleach in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (5.0.1)\n",
|
929 |
+
"Requirement already satisfied: pandocfilters>=1.4.1 in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (1.5.0)\n",
|
930 |
+
"Requirement already satisfied: nbclient>=0.5.0 in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.6.7)\n",
|
931 |
+
"Requirement already satisfied: beautifulsoup4 in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (4.11.1)\n",
|
932 |
+
"Requirement already satisfied: lxml in /usr/lib/python3.10/site-packages (from nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (4.9.2)\n",
|
933 |
+
"Requirement already satisfied: fastjsonschema in /usr/lib/python3.10/site-packages (from nbformat->notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (2.16.2)\n",
|
934 |
+
"Requirement already satisfied: ptyprocess in /usr/lib/python3.10/site-packages (from terminado>=0.8.3->notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (0.7.0)\n",
|
935 |
+
"Requirement already satisfied: argon2-cffi-bindings in /usr/lib/python3.10/site-packages (from argon2-cffi->notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (21.2.0)\n",
|
936 |
+
"Requirement already satisfied: qtpy>=2.0.1 in /usr/lib/python3.10/site-packages (from qtconsole->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (2.2.0)\n",
|
937 |
+
"Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/lib/python3.10/site-packages (from google-auth<3.0dev,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]>=1.13.0->modin[all]) (5.3.0)\n",
|
938 |
+
"Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/lib/python3.10/site-packages (from google-auth<3.0dev,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]>=1.13.0->modin[all]) (0.2.8)\n",
|
939 |
+
"Requirement already satisfied: rsa<5,>=3.1.4 in /usr/lib/python3.10/site-packages (from google-auth<3.0dev,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]>=1.13.0->modin[all]) (4.9)\n",
|
940 |
+
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/lib/python3.10/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.8.3)\n",
|
941 |
+
"Requirement already satisfied: cffi>=1.0.1 in /usr/lib/python3.10/site-packages (from argon2-cffi-bindings->argon2-cffi->notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (1.15.1)\n",
|
942 |
+
"Requirement already satisfied: soupsieve>1.2 in /usr/lib/python3.10/site-packages (from beautifulsoup4->nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (2.3.2.post1)\n",
|
943 |
+
"Requirement already satisfied: webencodings in /usr/lib/python3.10/site-packages (from bleach->nbconvert->jupyter>=1.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.5.1)\n",
|
944 |
+
"Requirement already satisfied: asttokens>=2.1.0 in /usr/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (2.2.1)\n",
|
945 |
+
"Requirement already satisfied: pure-eval in /usr/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (0.2.2)\n",
|
946 |
+
"Requirement already satisfied: executing>=1.2.0 in /usr/lib/python3.10/site-packages (from stack-data->ipython>=6.1.0->ipywidgets>=7.0.0->modin-spreadsheet>=0.1.0->modin[all]) (1.2.0)\n",
|
947 |
+
"Requirement already satisfied: pycparser in /usr/lib/python3.10/site-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi->notebook>=6.0.3->modin-spreadsheet>=0.1.0->modin[all]) (2.21)\n",
|
948 |
+
"Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]>=1.13.0->modin[all]) (0.4.8)\n",
|
949 |
+
"Building wheels for collected packages: gpustat\n",
|
950 |
+
" Building wheel for gpustat (setup.py) ... \u001b[?25ldone\n",
|
951 |
+
"\u001b[?25h Created wheel for gpustat: filename=gpustat-1.0.0-py3-none-any.whl size=19867 sha256=0b87d3b70c7f45e1ebd29b9440afce00b24149ffa1d6b3c556f22f914a8ec725\n",
|
952 |
+
" Stored in directory: /home/panchajanya/.cache/pip/wheels/ff/6f/e4/a4ad313e84cc4786f48fb7196aae415c5640c6fa43f9aabcd4\n",
|
953 |
+
"Successfully built gpustat\n",
|
954 |
+
"Installing collected packages: sortedcontainers, py-spy, opencensus-context, nvidia-ml-py, heapdict, colorful, zipp, zict, toolz, tblib, smart-open, pyarrow, plumbum, mpi4py-mpich, locket, jmespath, grpcio, fsspec, filelock, cloudpickle, blessed, virtualenv, unidist, rpyc, partd, pandas, importlib-metadata, gpustat, botocore, s3transfer, ray, modin, dask, aiohttp-cors, opencensus, distributed, boto3, modin-spreadsheet\n",
|
955 |
+
"\u001b[33m WARNING: The script plasma_store is installed in '/home/panchajanya/.local/bin' which is not on PATH.\n",
|
956 |
+
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
|
957 |
+
"\u001b[0m\u001b[33m WARNING: The script mpiexec is installed in '/home/panchajanya/.local/bin' which is not on PATH.\n",
|
958 |
+
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
|
959 |
+
"\u001b[0m\u001b[33m WARNING: The script virtualenv is installed in '/home/panchajanya/.local/bin' which is not on PATH.\n",
|
960 |
+
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
|
961 |
+
"\u001b[0m\u001b[33m WARNING: The script gpustat is installed in '/home/panchajanya/.local/bin' which is not on PATH.\n",
|
962 |
+
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
|
963 |
+
"\u001b[0m\u001b[33m WARNING: The scripts ray, rllib, serve and tune are installed in '/home/panchajanya/.local/bin' which is not on PATH.\n",
|
964 |
+
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
|
965 |
+
"\u001b[0m\u001b[33m WARNING: The script dask is installed in '/home/panchajanya/.local/bin' which is not on PATH.\n",
|
966 |
+
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
|
967 |
+
"\u001b[0m\u001b[33m WARNING: The scripts dask-scheduler, dask-ssh and dask-worker are installed in '/home/panchajanya/.local/bin' which is not on PATH.\n",
|
968 |
+
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\u001b[33m\n",
|
969 |
+
"\u001b[0mSuccessfully installed aiohttp-cors-0.7.0 blessed-1.20.0 boto3-1.26.101 botocore-1.29.101 cloudpickle-2.2.1 colorful-0.5.5 dask-2023.3.2 distributed-2023.3.2 filelock-3.10.7 fsspec-2023.3.0 gpustat-1.0.0 grpcio-1.53.0 heapdict-1.0.1 importlib-metadata-6.1.0 jmespath-1.0.1 locket-1.0.0 modin-0.19.0 modin-spreadsheet-0.1.2 mpi4py-mpich-3.1.2 nvidia-ml-py-11.495.46 opencensus-0.11.2 opencensus-context-0.1.3 pandas-1.5.3 partd-1.3.0 plumbum-1.8.1 py-spy-0.3.14 pyarrow-11.0.0 ray-2.3.1 rpyc-4.1.5 s3transfer-0.6.0 smart-open-6.3.0 sortedcontainers-2.4.0 tblib-1.7.0 toolz-0.12.0 unidist-0.3.0 virtualenv-20.21.0 zict-2.2.0 zipp-3.15.0\n"
|
970 |
+
]
|
971 |
+
}
|
972 |
+
],
|
973 |
+
"source": [
|
974 |
+
"!pip install \"modin[all]\""
|
975 |
+
]
|
976 |
+
},
|
977 |
+
{
|
978 |
+
"cell_type": "code",
|
979 |
+
"execution_count": null,
|
980 |
+
"metadata": {},
|
981 |
+
"outputs": [],
|
982 |
+
"source": []
|
983 |
+
}
|
984 |
+
],
|
985 |
+
"metadata": {
|
986 |
+
"kernelspec": {
|
987 |
+
"display_name": "Python 3 (ipykernel)",
|
988 |
+
"language": "python",
|
989 |
+
"name": "python3"
|
990 |
+
},
|
991 |
+
"language_info": {
|
992 |
+
"codemirror_mode": {
|
993 |
+
"name": "ipython",
|
994 |
+
"version": 3
|
995 |
+
},
|
996 |
+
"file_extension": ".py",
|
997 |
+
"mimetype": "text/x-python",
|
998 |
+
"name": "python",
|
999 |
+
"nbconvert_exporter": "python",
|
1000 |
+
"pygments_lexer": "ipython3",
|
1001 |
+
"version": "3.10.9"
|
1002 |
+
},
|
1003 |
+
"orig_nbformat": 4,
|
1004 |
+
"vscode": {
|
1005 |
+
"interpreter": {
|
1006 |
+
"hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a"
|
1007 |
+
}
|
1008 |
+
}
|
1009 |
+
},
|
1010 |
+
"nbformat": 4,
|
1011 |
+
"nbformat_minor": 2
|
1012 |
+
}
|
spam_classifier.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import libraries
import pandas as pd
import numpy as np  # NOTE(review): imported but never used in this script
import matplotlib.pyplot as plt  # NOTE(review): imported but never used in this script
import gradio as gr

# import string (provides string.punctuation for message cleaning)
import string

# import countvectorizer (bag-of-words feature extraction)
from sklearn.feature_extraction.text import CountVectorizer

# import train_test_split (train/test partitioning)
from sklearn.model_selection import train_test_split

# import multinomial naive bayes (the classifier used below)
from sklearn.naive_bayes import MultinomialNB

# read the tab-separated dataset; the file has no header row, so the two
# columns are named explicitly
df = pd.read_csv('dataset/spam.tsv', sep='\t', names=['label', 'message'])

# the two class labels expected in the 'label' column
# NOTE(review): `features` is never referenced again — confirm it can be removed
features = ['spam', 'ham']
|
24 |
+
|
25 |
+
# write a function to remove punctuations from messages

# Translation table that deletes every ASCII punctuation character; built
# once at import time instead of per call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(text):
    """Return *text* with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The same text with every character in ``string.punctuation`` deleted.
    """
    # str.translate runs in a single C-level pass, replacing the original
    # per-character Python list comprehension + join.
    return text.translate(_PUNCT_TABLE)
|
30 |
+
|
31 |
+
# apply the function to the message column (strip punctuation in place)
df['message'] = df['message'].apply(remove_punctuation)

# after removing punctuations, record each message's character length
# NOTE(review): 'length' is never used later — likely leftover from notebook EDA
df['length'] = df['message'].apply(len)

# bag-of-words vectorizer; drops common English stop words
CV = CountVectorizer(stop_words='english')

# assign the contents of each 'message' to X and 'label' to y
X = df['message'].values
y = df['label'].values

# split the dataset into train and test (80/20 split, fixed seed for
# reproducible results across runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# learn the vocabulary from the training data and vectorize it
X_train_CV = CV.fit_transform(X_train)

# vectorize the test data with the already-fitted vocabulary (transform only,
# so no test-set leakage into the vocabulary)
X_test_CV = CV.transform(X_test)

# create an instance of the classifier
NB = MultinomialNB()

# fit the classifier to the training data
NB.fit(X_train_CV, y_train)

# predict on the held-out test data
# NOTE(review): y_pred is never compared against y_test — despite the original
# "test the accuracy" intent, no accuracy metric is computed or reported
y_pred = NB.predict(X_test_CV)
|
61 |
+
|
62 |
+
# write a function that will take a string as input and return the prediction
def predict_spam(message):
    """Classify a single raw message with the trained model.

    Parameters
    ----------
    message : str
        The message text entered by the user.

    Returns
    -------
    str
        A human-readable verdict, either ham or spam.
    """
    # vectorize with the vocabulary learned at training time
    vector = CV.transform([message])
    # NB.predict returns an ndarray of labels; take the single element instead
    # of truth-testing the whole array (ambiguous/deprecated NumPy behavior)
    prediction = NB.predict(vector)[0]
    if prediction == 'ham':
        return 'This is a ham message'
    return 'This is a spam message'
|
71 |
+
|
72 |
+
# Build the Gradio web UI around the classifier.
iface = gr.Interface(
    fn=predict_spam,
    inputs=gr.Textbox(lines=2, placeholder="Enter a message to check if it is spam or ham", label="Message", info="Enter a message"),
    # bug fix: user-facing typo "enetered" -> "entered"
    outputs=gr.Textbox(lines=2, info="Check if the entered message is spam or ham", label="Prediction", placeholder="Output will be here.."),
    title="Spam Classifier",
    description="Enter a message to check if it is spam or ham",
    allow_flagging='never',
    examples=[['Hey, how are you doing?'], ['Congratulations! You have won a free trip to Dubai!']])

# Start the web server (blocks until the app is stopped).
iface.launch()
|