Maslov-Artem committed on
Commit
9f7701f
·
1 Parent(s): 352ab25

Add logreg

Browse files
Files changed (6) hide show
  1. .gitignore +2 -0
  2. app.py +44 -0
  3. logreg_model.pkl +3 -0
  4. preprocessing.py +31 -0
  5. requirements.txt +199 -0
  6. vectorizer.pkl +3 -0
.gitignore CHANGED
@@ -1 +1,3 @@
1
  .venv
 
 
 
1
  .venv
2
+ healthcare_facilities_reviews.jsonl
3
+ nlp_models.ipynb
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
+ import streamlit as st
4
+
5
+ from preprocessing import data_preprocessing
6
+
7
@st.cache_resource
def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    Wrapped in ``st.cache_resource`` because Streamlit re-executes this
    script on every widget interaction; caching keeps the artifact from
    being re-read and re-unpickled on each rerun.

    NOTE(security): ``pickle.load`` can execute arbitrary code, so only
    artifacts produced by this project should ever be loaded here.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# Load preprocessing steps
vectorizer = _load_pickle("vectorizer.pkl")

# Load trained model
logreg = _load_pickle("logreg_model.pkl")
14
+
15
+
16
def preprocess_text(text):
    """Convert a raw review string into the model's input features.

    The text is passed through ``data_preprocessing``; the resulting tokens
    are joined with single spaces and handed to ``vectorizer.transform`` as
    a one-document batch. The transformed batch is returned as-is.
    """
    tokens = data_preprocessing(text)
    print("Clean text ", tokens)
    document = " ".join(tokens)
    return vectorizer.transform([document])
25
+
26
+
27
def predict_sentiment(text):
    """Return the model's prediction for a single review.

    Args:
        text: Raw review text as entered by the user.

    Returns:
        Whatever ``logreg.predict`` yields for the one-document batch
        produced by ``preprocess_text``.
    """
    features = preprocess_text(text)
    return logreg.predict(features)
35
+
36
+
37
# Streamlit app code
st.title("Sentiment Analysis with Logistic Regression")
text_input = st.text_input("Enter your review:")
if st.button("Predict"):
    if not text_input.strip():
        # Nothing to classify: ask for input instead of scoring an empty string.
        st.warning("Please enter a review first.")
    else:
        prediction = predict_sentiment(text_input)
        st.write("Predicted Sentiment:", prediction)
logreg_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d996382921eccfb01cb81804f5b0ccd5e1ff18939b3d8e63fefadb2b2cc6053
3
+ size 664190
preprocessing.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+
4
+ import pymorphy2
5
+ from nltk.corpus import stopwords
6
+ from nltk.tokenize import word_tokenize
7
+
8
+ stop_words = set(stopwords.words("russian"))
9
+
10
+
11
def clean_text(text: str) -> str:
    """Lower-case *text* and strip noise from it.

    Removed, in this order: words containing the same character three or
    more times in a row, digit runs together with any word characters that
    directly follow them, square-bracketed spans, and ASCII punctuation.
    Whitespace left behind by the removals is not collapsed.
    """
    patterns = (
        r"\w*(\w)\1{2,}\w*",
        r"\d+\w*",
        r"\[.*?\]",
    )
    result = text.lower()
    for pattern in patterns:
        result = re.sub(pattern, "", result)
    punctuation_table = str.maketrans("", "", string.punctuation)
    return result.translate(punctuation_table)
18
+
19
+
20
_MORPH = None


def _get_morph():
    """Return the shared ``pymorphy2.MorphAnalyzer``, creating it on first use."""
    global _MORPH
    if _MORPH is None:
        # Building the analyzer is costly, so it is done once and the
        # instance is reused across calls.
        _MORPH = pymorphy2.MorphAnalyzer()
    return _MORPH


def lemmize_and_tokenize_text(text: str) -> list[str]:
    """Tokenize *text*, drop stop words, and return the tokens' normal forms.

    Tokens come from ``word_tokenize``; any token present in the module-level
    ``stop_words`` set is discarded. For each remaining token the first parse
    returned by the analyzer supplies the ``normal_form``.
    """
    morph = _get_morph()
    return [
        morph.parse(token)[0].normal_form
        for token in word_tokenize(text)
        if token not in stop_words
    ]
26
+
27
+
28
def data_preprocessing(text: str) -> list[str]:
    """Run the full pipeline on *text*: ``clean_text`` first, then
    ``lemmize_and_tokenize_text`` on the cleaned string.

    Returns the list produced by ``lemmize_and_tokenize_text``.
    """
    return lemmize_and_tokenize_text(clean_text(text))
requirements.txt CHANGED
@@ -86,3 +86,202 @@ validators==0.22.0
86
  watchdog==4.0.0
87
  wcwidth==0.2.13
88
  zipp==3.17.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  watchdog==4.0.0
87
  wcwidth==0.2.13
88
  zipp==3.17.0
89
+ absl-py==2.1.0
90
+ aiofiles==23.2.1
91
+ aiogram==3.3.0
92
+ aiohttp==3.9.1
93
+ aiosignal==1.3.1
94
+ altair==5.2.0
95
+ annotated-types==0.6.0
96
+ antlr4-python3-runtime==4.9.3
97
+ appdirs==1.4.4
98
+ appnope==0.1.4
99
+ asttokens==2.4.1
100
+ astunparse==1.6.3
101
+ attrs==23.2.0
102
+ backcall==0.2.0
103
+ beautifulsoup4==4.12.2
104
+ black==24.2.0
105
+ bleach==6.1.0
106
+ blinker==1.7.0
107
+ cachetools==5.3.2
108
+ category-encoders==2.6.3
109
+ certifi==2023.7.22
110
+ chardet==4.0.0
111
+ charset-normalizer==3.3.2
112
+ click==8.1.7
113
+ cloudpickle==3.0.0
114
+ contourpy==1.2.0
115
+ cycler==0.10.0
116
+ DAWG-Python==0.7.2
117
+ decorator==5.1.1
118
+ defusedxml==0.7.1
119
+ detectron2 @ git+https://github.com/facebookresearch/detectron2.git@3ff5dd1cff4417af07097064813c9f28d7461d3c
120
+ dm-tree==0.1.8
121
+ docopt==0.6.2
122
+ executing==2.0.1
123
+ fastjsonschema==2.19.1
124
+ filelock==3.13.1
125
+ flatbuffers==23.5.26
126
+ fonttools==4.47.2
127
+ frozendict==2.4.0
128
+ frozenlist==1.4.1
129
+ fsspec==2024.2.0
130
+ fvcore==0.1.5.post20221221
131
+ gast==0.5.4
132
+ gitdb==4.0.11
133
+ GitPython==3.1.41
134
+ google-pasta==0.2.0
135
+ grpcio==1.62.0
136
+ h5py==3.10.0
137
+ html5lib==1.1
138
+ hydra-core==1.3.2
139
+ idna==2.10
140
+ imageio==2.33.1
141
+ importlib-metadata==7.0.1
142
+ importlib-resources==6.1.1
143
+ iopath==0.1.9
144
+ ipython==8.12.3
145
+ iso8601==2.1.0
146
+ jedi==0.19.1
147
+ Jinja2==3.1.3
148
+ joblib==1.3.2
149
+ jsonschema==4.20.0
150
+ jsonschema-specifications==2023.12.1
151
+ jupyter_client==8.6.0
152
+ jupyter_core==5.7.1
153
+ jupyterlab_pygments==0.3.0
154
+ kaggle==1.6.6
155
+ keras==3.0.5
156
+ kiwisolver==1.4.5
157
+ lazy_loader==0.3
158
+ learn==1.0.0
159
+ libclang==16.0.6
160
+ llvmlite==0.42.0
161
+ lxml==5.1.0
162
+ magic-filter==1.0.12
163
+ Markdown==3.5.2
164
+ markdown-it-py==3.0.0
165
+ MarkupSafe==2.1.3
166
+ matplotlib==3.8.2
167
+ matplotlib-inline==0.1.6
168
+ mdurl==0.1.2
169
+ mistune==3.0.2
170
+ ml-dtypes==0.3.2
171
+ mplcyberpunk==0.7.1
172
+ mpmath==1.3.0
173
+ multidict==6.0.4
174
+ multitasking==0.0.11
175
+ mypy-extensions==1.0.0
176
+ namex==0.0.7
177
+ nbclient==0.9.0
178
+ nbconvert==7.14.2
179
+ nbformat==5.9.2
180
+ networkx==3.2.1
181
+ nltk==3.8.1
182
+ numba==0.59.0
183
+ numpy==1.26.3
184
+ omegaconf==2.3.0
185
+ opencv-python==4.9.0.80
186
+ opencv-python-headless==4.8.0.74
187
+ opt-einsum==3.3.0
188
+ packaging==23.2
189
+ pandas==2.1.4
190
+ pandocfilters==1.5.0
191
+ parso==0.8.3
192
+ pathspec==0.12.1
193
+ patsy==0.5.6
194
+ peewee==3.17.0
195
+ pexpect==4.9.0
196
+ pickleshare==0.7.5
197
+ pillow==10.2.0
198
+ pipreqs==0.5.0
199
+ platformdirs==4.1.0
200
+ portalocker==2.8.2
201
+ prompt-toolkit==3.0.43
202
+ protobuf==4.25.2
203
+ psutil==5.9.8
204
+ ptyprocess==0.7.0
205
+ pure-eval==0.2.2
206
+ py-cpuinfo==9.0.0
207
+ pyarrow==14.0.2
208
+ pycocotools==2.0.7
209
+ pydantic==2.5.3
210
+ pydantic_core==2.14.6
211
+ pydeck==0.8.1b0
212
+ pyfiglet==0.7
213
+ Pygments==2.17.2
214
+ pymorphy2==0.9.1
215
+ pymorphy2-dicts-ru==2.4.417127.4579844
216
+ pynndescent==0.5.11
217
+ pyparsing==3.1.1
218
+ python-dateutil==2.8.2
219
+ python-dotenv==1.0.1
220
+ python-magic==0.4.27
221
+ python-slugify==8.0.4
222
+ pytz==2023.3.post1
223
+ PyYAML==6.0.1
224
+ pyzmq==25.1.2
225
+ referencing==0.32.1
226
+ regex==2023.12.25
227
+ requests==2.31.0
228
+ requests-toolbelt==1.0.0
229
+ rich==13.7.0
230
+ roboflow==1.1.21
231
+ rpds-py==0.17.1
232
+ schedule==1.2.1
233
+ scikit-base==0.7.2
234
+ scikit-image==0.22.0
235
+ scikit-learn==1.4.0
236
+ scipy==1.12.0
237
+ seaborn==0.13.1
238
+ setuptools==69.0.3
239
+ six==1.16.0
240
+ sktime==0.26.0
241
+ smmap==5.0.1
242
+ soupsieve==2.5
243
+ stack-data==0.6.3
244
+ statsmodels==0.14.1
245
+ streamlit==1.30.0
246
+ supervision==0.18.0
247
+ sweetviz==2.3.1
248
+ sympy==1.12
249
+ tabulate==0.9.0
250
+ tenacity==8.2.3
251
+ tensorboard==2.16.2
252
+ tensorboard-data-server==0.7.2
253
+ tensorflow==2.16.0rc0
254
+ termcolor==2.4.0
255
+ text-unidecode==1.3
256
+ thop==0.1.1.post2209072238
257
+ threadpoolctl==3.2.0
258
+ tifffile==2024.1.30
259
+ tinycss2==1.2.1
260
+ toml==0.10.2
261
+ toolz==0.12.0
262
+ torch==2.2.0
263
+ torchaudio==2.2.0
264
+ torchutils==0.0.4
265
+ torchvision==0.17.0
266
+ tornado==6.4
267
+ tqdm==4.66.1
268
+ traitlets==5.14.1
269
+ twitchio==2.8.2
270
+ typing_extensions==4.9.0
271
+ tzdata==2023.4
272
+ tzlocal==5.2
273
+ ultralytics==8.1.19
274
+ urllib3==2.1.0
275
+ validators==0.22.0
276
+ wcwidth==0.2.13
277
+ webencodings==0.5.1
278
+ Werkzeug==3.0.1
279
+ wheel==0.42.0
280
+ wrapt==1.16.0
281
+ xlrd==2.0.1
282
+ yacs==0.1.8
283
+ yarg==0.1.9
284
+ yarl==1.9.4
285
+ yellowbrick==1.5
286
+ yfinance==0.2.35
287
+ zipp==3.17.0
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f39240cedb8d7c29f44d666699030a8d088be200a8a7228efc5a2ec90293f6e
3
+ size 3379321