Pietro Lesci committed on
Commit
b748dad
·
1 Parent(s): c718eb8
.streamlit/config.toml CHANGED
@@ -1,4 +1,7 @@
1
  [server]
2
  # Max size, in megabytes, for files uploaded with the file_uploader.
3
  # Default: 200
4
- maxUploadSize = 10
 
 
 
 
1
  [server]
2
  # Max size, in megabytes, for files uploaded with the file_uploader.
3
  # Default: 200
4
+ maxUploadSize = 20
5
+
6
+ [browser]
7
+ gatherUsageStats = false
main.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from src.utils import get_logo, read_file, convert_df
3
+ from src.components import form, faq, presentation, footer, about
4
+
5
+
6
+ # app configs
7
+ st.set_page_config(
8
+ page_title="Wordify",
9
+ initial_sidebar_state="expanded",
10
+ layout="centered",
11
+ page_icon="./assets/logo.png",
12
+ menu_items={
13
+ 'Get Help': "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
14
+ 'Report a Bug': "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
15
+ 'About': about(),
16
+ }
17
+ )
18
+
19
+ # logo
20
+ st.sidebar.image(get_logo("./assets/logo.png"))
21
+
22
+ # title
23
+ st.title("Wordify")
24
+
25
+ # file uploader
26
+ uploaded_fl = st.sidebar.file_uploader(
27
+ label="Choose a file",
28
+ type=["csv", "parquet", "tsv", "xlsx"],
29
+ accept_multiple_files=False,
30
+ help="""
31
+ Supported formats:
32
+ - CSV
33
+ - TSV
34
+ - PARQUET
35
+ - XLSX (do not support [Strict Open XML Spreadsheet format](https://stackoverflow.com/questions/62800822/openpyxl-cannot-read-strict-open-xml-spreadsheet-format-userwarning-file-conta))
36
+ """,
37
+ )
38
+
39
+ if not uploaded_fl:
40
+ presentation()
41
+ faq()
42
+ else:
43
+ df = read_file(uploaded_fl)
44
+ new_df = form(df)
45
+ if new_df is not None:
46
+ payload = convert_df(new_df)
47
+ st.download_button(
48
+ label="Download data as CSV",
49
+ data=payload,
50
+ file_name="wordify_results.csv",
51
+ mime="text/csv",
52
+ )
53
+
54
+
55
+ # footer
56
+ footer()
notebooks/wordifier_nb.ipynb CHANGED
@@ -1,67 +1,589 @@
1
  {
2
- "metadata": {
3
- "language_info": {
4
- "codemirror_mode": {
5
- "name": "ipython",
6
- "version": 3
7
- },
8
- "file_extension": ".py",
9
- "mimetype": "text/x-python",
10
- "name": "python",
11
- "nbconvert_exporter": "python",
12
- "pygments_lexer": "ipython3",
13
- "version": "3.8.3"
14
- },
15
- "orig_nbformat": 2,
16
- "kernelspec": {
17
- "name": "python383jvsc74a57bd01cb9a1c850fd1d16c5b98054247a74b7b7a12849bcfa00436ba202c2a9e2bbb2",
18
- "display_name": "Python 3.8.3 64-bit ('py38': conda)"
19
- }
20
- },
21
- "nbformat": 4,
22
- "nbformat_minor": 2,
23
  "cells": [
24
  {
25
  "cell_type": "code",
26
- "execution_count": 1,
27
  "metadata": {},
28
  "outputs": [],
29
  "source": [
30
  "import sys\n",
31
- "nb_dir = os.path.split(os.getcwd())[0]\n",
32
- "if nb_dir not in sys.path:\n",
33
- " sys.path.append(nb_dir)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  "import numpy as np\n",
36
  "import pandas as pd\n",
37
- "# import modin.pandas as mpd\n",
38
- "import spacy\n",
39
- "from src.configs import ModelConfigs, Languages\n",
40
- "from src.utils import wordifier, TextPreprocessor, encode\n",
41
- "\n",
42
- "from textacy.preprocessing import make_pipeline, remove, replace, normalize\n",
43
- "from tqdm import trange\n",
44
- "from sklearn.feature_extraction.text import TfidfVectorizer\n",
45
  "from sklearn.linear_model import LogisticRegression\n",
46
- "from sklearn.preprocessing import LabelEncoder\n",
47
  "from sklearn.utils import resample\n",
48
- "import multiprocessing as mp\n",
49
- "# import dask.dataframe as dask_df\n",
50
- "from stqdm import stqdm\n",
51
- "stqdm.pandas()\n",
52
  "\n",
53
- "from tqdm import trange\n",
54
  "\n",
55
- "import os\n",
56
- "# os.environ[\"MODIN_ENGINE\"] = \"ray\" # Modin will use Ray\n",
57
  "\n",
58
- "import vaex\n",
59
- "pd.set_option(\"display.max_colwidth\", None)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  ]
61
  },
62
  {
63
  "cell_type": "code",
64
- "execution_count": 4,
65
  "metadata": {},
66
  "outputs": [],
67
  "source": [
@@ -70,7 +592,7 @@
70
  },
71
  {
72
  "cell_type": "code",
73
- "execution_count": 28,
74
  "metadata": {},
75
  "outputs": [],
76
  "source": [
@@ -79,7 +601,7 @@
79
  },
80
  {
81
  "cell_type": "code",
82
- "execution_count": 29,
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
@@ -91,7 +613,7 @@
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": 30,
95
  "metadata": {},
96
  "outputs": [],
97
  "source": [
@@ -104,24 +626,16 @@
104
  },
105
  {
106
  "cell_type": "code",
107
- "execution_count": 31,
108
  "metadata": {},
109
- "outputs": [
110
- {
111
- "output_type": "stream",
112
- "name": "stderr",
113
- "text": [
114
- "100%|██████████| 9939/9939 [00:06<00:00, 1431.09it/s]\n"
115
- ]
116
- }
117
- ],
118
  "source": [
119
  "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
120
  ]
121
  },
122
  {
123
  "cell_type": "code",
124
- "execution_count": 32,
125
  "metadata": {},
126
  "outputs": [],
127
  "source": [
@@ -130,7 +644,7 @@
130
  },
131
  {
132
  "cell_type": "code",
133
- "execution_count": 21,
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
@@ -146,28 +660,9 @@
146
  },
147
  {
148
  "cell_type": "code",
149
- "execution_count": 22,
150
  "metadata": {},
151
- "outputs": [
152
- {
153
- "output_type": "stream",
154
- "name": "stdout",
155
- "text": [
156
- "CPU times: user 1.45 s, sys: 10.6 ms, total: 1.46 s\nWall time: 1.46 s\n"
157
- ]
158
- },
159
- {
160
- "output_type": "execute_result",
161
- "data": {
162
- "text/plain": [
163
- "LogisticRegression(C=0.05, class_weight='balanced', max_iter=500, penalty='l1',\n",
164
- " solver='liblinear')"
165
- ]
166
- },
167
- "metadata": {},
168
- "execution_count": 22
169
- }
170
- ],
171
  "source": [
172
  "%%time\n",
173
  "clf.fit(X, y)"
@@ -182,32 +677,9 @@
182
  },
183
  {
184
  "cell_type": "code",
185
- "execution_count": 14,
186
  "metadata": {},
187
- "outputs": [
188
- {
189
- "output_type": "stream",
190
- "name": "stderr",
191
- "text": [
192
- " 6%|▌ | 28/500 [01:01<27:33, 3.50s/it]/Users/49796/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py:976: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
193
- " warnings.warn(\"Liblinear failed to converge, increase \"\n",
194
- " 31%|███ | 156/500 [06:18<13:54, 2.43s/it]\n"
195
- ]
196
- },
197
- {
198
- "output_type": "error",
199
- "ename": "KeyboardInterrupt",
200
- "evalue": "",
201
- "traceback": [
202
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
203
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
204
- "\u001b[0;32m<ipython-input-14-1fef5b7ccf45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# fit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 41\u001b[0;31m \u001b[0mclf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 42\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
205
- "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 1354\u001b[0m \u001b[0;34m\" 'solver' is set to 'liblinear'. Got 'n_jobs'\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1355\u001b[0m \" = {}.\".format(effective_n_jobs(self.n_jobs)))\n\u001b[0;32m-> 1356\u001b[0;31m self.coef_, self.intercept_, n_iter_ = _fit_liblinear(\n\u001b[0m\u001b[1;32m 1357\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_intercept\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintercept_scaling\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1358\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdual\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
206
- "\u001b[0;32m~/miniconda3/envs/py38/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36m_fit_liblinear\u001b[0;34m(X, y, C, fit_intercept, intercept_scaling, class_weight, penalty, dual, verbose, max_iter, tol, random_state, multi_class, loss, epsilon, sample_weight)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0msolver_type\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_liblinear_solver_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmulti_class\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpenalty\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdual\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m raw_coef_, n_iter_ = liblinear.train_wrap(\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_ind\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misspmatrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msolver_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbias\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[0mclass_weight_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miinfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'i'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
207
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
208
- ]
209
- }
210
- ],
211
  "source": [
212
  "n_instances, n_features = X.shape\n",
213
  "n_classes = len(y_names)\n",
@@ -293,5 +765,30 @@
293
  "outputs": [],
294
  "source": []
295
  }
296
- ]
297
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 65,
6
  "metadata": {},
7
  "outputs": [],
8
  "source": [
9
  "import sys\n",
10
+ "sys.path.insert(0, \"..\")\n",
11
+ "import vaex\n",
12
+ "from vaex.ml import LabelEncoder\n",
13
+ "import spacy\n",
14
+ "import pandas as pd\n",
15
+ "from tqdm import tqdm\n",
16
+ "import os\n",
17
+ "import multiprocessing as mp\n",
18
+ "from src.preprocessing import PreprocessingPipeline, encode\n",
19
+ "from src.wordifier import ModelConfigs\n",
20
+ "from sklearn.pipeline import Pipeline\n",
21
+ "from sklearn.linear_model import LogisticRegression\n",
22
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
23
+ "import numpy as np"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 67,
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "pipe = PreprocessingPipeline(\n",
33
+ " language=\"English\",\n",
34
+ " pre_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n",
35
+ " lemmatization_step=list(PreprocessingPipeline.lemmatization_component().keys())[1],\n",
36
+ " post_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n",
37
+ ")"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 68,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "def fn(t):\n",
47
+ " return pipe.post(pipe.lemma(pipe.nlp(pipe.pre(t))))"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 69,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "vdf = vaex.from_pandas(df)\n",
57
+ "vdf[\"processed_text\"] = vdf.apply(fn, arguments=[vdf[\"text\"]], vectorize=False)\n",
58
+ "df = vdf.to_pandas_df()"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 71,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "name": "stderr",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "2021-11-28 17:01:36.883 \n",
71
+ " \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n",
72
+ " command:\n",
73
+ "\n",
74
+ " streamlit run /Users/pietrolesci/miniconda3/envs/wordify/lib/python3.7/site-packages/ipykernel_launcher.py [ARGUMENTS]\n"
75
+ ]
76
+ }
77
+ ],
78
+ "source": [
79
+ "import streamlit as st\n",
80
+ "pbar = st.progress(0)\n",
81
+ "N = 100\n",
82
+ "for i, _ in enumerate(range(N)):\n",
83
+ " if i % N == 0:\n",
84
+ " pbar.progress(1)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": []
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 24,
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "configs = ModelConfigs\n",
101
+ "clf = Pipeline(\n",
102
+ " [\n",
103
+ " (\"tfidf\", TfidfVectorizer()),\n",
104
+ " (\n",
105
+ " \"classifier\",\n",
106
+ " LogisticRegression(\n",
107
+ " penalty=\"l1\",\n",
108
+ " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n",
109
+ " solver=\"liblinear\",\n",
110
+ " multi_class=\"auto\",\n",
111
+ " max_iter=500,\n",
112
+ " class_weight=\"balanced\",\n",
113
+ " ),\n",
114
+ " ),\n",
115
+ " ]\n",
116
+ ")\n"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 29,
122
+ "metadata": {},
123
+ "outputs": [
124
+ {
125
+ "data": {
126
+ "text/plain": [
127
+ "Pipeline(steps=[('tfidf', TfidfVectorizer()),\n",
128
+ " ('classifier',\n",
129
+ " LogisticRegression(C=1, class_weight='balanced', max_iter=500,\n",
130
+ " penalty='l1', solver='liblinear'))])"
131
+ ]
132
+ },
133
+ "execution_count": 29,
134
+ "metadata": {},
135
+ "output_type": "execute_result"
136
+ }
137
+ ],
138
+ "source": [
139
+ "clf.fit(df[\"text\"], df[\"label\"])"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 39,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "data": {
149
+ "text/plain": [
150
+ "array(['00', '000', '00001', ..., 'ís', 'über', 'überwoman'], dtype=object)"
151
+ ]
152
+ },
153
+ "execution_count": 39,
154
+ "metadata": {},
155
+ "output_type": "execute_result"
156
+ }
157
+ ],
158
+ "source": []
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 40,
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "def wordifier(df, text_col, label_col, configs=ModelConfigs):\n",
167
+ "\n",
168
+ " n_instances, n_features = X.shape\n",
169
+ " n_classes = np.unique(y)\n",
170
+ "\n",
171
+ " # NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
172
+ " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
173
+ "\n",
174
+ " sample_size = min(\n",
175
+ " # this is the maximum supported\n",
176
+ " configs.MAX_SELECTION.value,\n",
177
+ " # at minimum you want MIN_SELECTION but in general you want\n",
178
+ " # n_instances * sample_fraction\n",
179
+ " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
180
+ " # however if previous one is bigger the the available instances take\n",
181
+ " # the number of available instances\n",
182
+ " n_instances,\n",
183
+ " )\n",
184
+ "\n",
185
+ " # TODO: might want to try out something to subsample features at each iteration\n",
186
+ "\n",
187
+ " # initialize coefficient matrices\n",
188
+ " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
189
+ " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
190
+ "\n",
191
+ " for _ in range(configs.NUM_ITERS.value):\n",
192
+ "\n",
193
+ " # run randomized regression\n",
194
+ " clf = Pipeline([\n",
195
+ " ('tfidf', TfidfVectorizer()), \n",
196
+ " ('classifier', LogisticRegression(\n",
197
+ " penalty=\"l1\",\n",
198
+ " C=configs.PENALTIES.value[\n",
199
+ " np.random.randint(len(configs.PENALTIES.value))\n",
200
+ " ],\n",
201
+ " solver=\"liblinear\",\n",
202
+ " multi_class=\"auto\",\n",
203
+ " max_iter=500,\n",
204
+ " class_weight=\"balanced\",\n",
205
+ " ))]\n",
206
+ " )\n",
207
+ "\n",
208
+ " # sample indices to subsample matrix\n",
209
+ " selection = resample(\n",
210
+ " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n",
211
+ " )\n",
212
+ "\n",
213
+ " # fit\n",
214
+ " try:\n",
215
+ " clf.fit(X[selection], y[selection])\n",
216
+ " except ValueError:\n",
217
+ " continue\n",
218
+ "\n",
219
+ " # record coefficients\n",
220
+ " if n_classes == 2:\n",
221
+ " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
222
+ " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
223
+ " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
224
+ " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
225
+ " else:\n",
226
+ " pos_scores += clf.coef_ > 0\n",
227
+ " neg_scores += clf.coef_ < 0\n",
228
+ "\n",
229
+ "\n",
230
+ " # normalize\n",
231
+ " pos_scores = pos_scores / configs.NUM_ITERS.value\n",
232
+ " neg_scores = neg_scores / configs.NUM_ITERS.value\n",
233
+ "\n",
234
+ " # get only active features\n",
235
+ " pos_positions = np.where(\n",
236
+ " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n",
237
+ " )\n",
238
+ " neg_positions = np.where(\n",
239
+ " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n",
240
+ " )\n",
241
+ "\n",
242
+ " # prepare DataFrame\n",
243
+ " X_names = clf.steps[0][1].get_feature_names_out()\n",
244
+ " pos = [\n",
245
+ " (X_names[i], pos_scores[c, i], y_names[c])\n",
246
+ " for c, i in zip(*pos_positions.nonzero())\n",
247
+ " ]\n",
248
+ " neg = [\n",
249
+ " (X_names[i], neg_scores[c, i], y_names[c])\n",
250
+ " for c, i in zip(*neg_positions.nonzero())\n",
251
+ " ]\n",
252
+ "\n",
253
+ " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n",
254
+ " [\"label\", \"score\"], ascending=False\n",
255
+ " )\n",
256
+ " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n",
257
+ " [\"label\", \"score\"], ascending=False\n",
258
+ " )\n",
259
+ "\n",
260
+ " return posdf, negdf"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 41,
266
+ "metadata": {},
267
+ "outputs": [],
268
+ "source": [
269
+ "res = vdf.apply(wordifier, arguments=[vdf.processed_text, vdf.encoded_label], vectorize=False)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 45,
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "from vaex.ml.sklearn import Predictor"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 60,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "clf = Pipeline(\n",
288
+ " [\n",
289
+ " (\n",
290
+ " \"tfidf\",\n",
291
+ " TfidfVectorizer(\n",
292
+ " input=\"content\", # default: file already in memory\n",
293
+ " encoding=\"utf-8\", # default\n",
294
+ " decode_error=\"strict\", # default\n",
295
+ " strip_accents=None, # do nothing\n",
296
+ " lowercase=False, # do nothing\n",
297
+ " preprocessor=None, # do nothing - default\n",
298
+ " tokenizer=None, # default\n",
299
+ " stop_words=None, # do nothing\n",
300
+ " analyzer=\"word\",\n",
301
+ " ngram_range=(1, 3), # maximum 3-ngrams\n",
302
+ " min_df=0.001,\n",
303
+ " max_df=0.75,\n",
304
+ " sublinear_tf=True,\n",
305
+ " ),\n",
306
+ " ),\n",
307
+ " (\n",
308
+ " \"classifier\",\n",
309
+ " LogisticRegression(\n",
310
+ " penalty=\"l1\",\n",
311
+ " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n",
312
+ " solver=\"liblinear\",\n",
313
+ " multi_class=\"auto\",\n",
314
+ " max_iter=500,\n",
315
+ " class_weight=\"balanced\",\n",
316
+ " ),\n",
317
+ " ),\n",
318
+ " ]\n",
319
+ ")\n",
320
  "\n",
321
+ "vaex_model = Predictor(\n",
322
+ " features=[\"processed_text\"],\n",
323
+ " target=\"encoded_label\",\n",
324
+ " model=clf,\n",
325
+ " prediction_name=\"prediction\",\n",
326
+ ")\n"
327
+ ]
328
+ },
329
+ {
330
+ "cell_type": "code",
331
+ "execution_count": 61,
332
+ "metadata": {},
333
+ "outputs": [
334
+ {
335
+ "ename": "TypeError",
336
+ "evalue": "unhashable type: 'list'",
337
+ "output_type": "error",
338
+ "traceback": [
339
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
340
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
341
+ "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
342
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
343
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m \"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
344
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
345
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
346
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
347
+ "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
348
+ ]
349
+ }
350
+ ],
351
+ "source": [
352
+ "vaex_model.fit(vdf)"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": null,
358
+ "metadata": {},
359
+ "outputs": [],
360
+ "source": []
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 52,
365
+ "metadata": {},
366
+ "outputs": [
367
+ {
368
+ "data": {
369
+ "text/plain": [
370
+ "b'\\x80\\x03c__main__\\nwordifier\\nq\\x00.'"
371
+ ]
372
+ },
373
+ "execution_count": 52,
374
+ "metadata": {},
375
+ "output_type": "execute_result"
376
+ }
377
+ ],
378
+ "source": [
379
+ "import pickle\n",
380
+ "pickle.dumps(wordifier)"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": 47,
386
+ "metadata": {},
387
+ "outputs": [
388
+ {
389
+ "ename": "TypeError",
390
+ "evalue": "unhashable type: 'list'",
391
+ "output_type": "error",
392
+ "traceback": [
393
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
394
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
395
+ "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
396
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
397
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m \"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
398
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
399
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
400
+ "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
401
+ "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'"
402
+ ]
403
+ }
404
+ ],
405
+ "source": []
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": null,
410
+ "metadata": {},
411
+ "outputs": [],
412
+ "source": []
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": null,
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": [
420
+ "res = []\n",
421
+ "with tqdm(total=len(df)) as pbar:\n",
422
+ " for doc in tqdm(nlp.pipe(df[\"text\"].values, batch_size=500, n_process=n_cpus)):\n",
423
+ " res.append([i.lemma_ for i in doc])\n",
424
+ " pbar.update(1)"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "metadata": {},
431
+ "outputs": [],
432
+ "source": [
433
+ "import pickle"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": null,
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "def fn(t):\n",
443
+ " return "
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": null,
449
+ "metadata": {},
450
+ "outputs": [],
451
+ "source": [
452
+ "%%timeit\n",
453
+ "with mp.Pool(mp.cpu_count()) as pool:\n",
454
+ " new_s = pool.map(nlp, df[\"text\"].values)"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "code",
459
+ "execution_count": null,
460
+ "metadata": {},
461
+ "outputs": [],
462
+ "source": []
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": null,
467
+ "metadata": {},
468
+ "outputs": [],
469
+ "source": []
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": null,
474
+ "metadata": {},
475
+ "outputs": [],
476
+ "source": [
477
+ "from typing import List\n",
478
  "import numpy as np\n",
479
  "import pandas as pd\n",
480
+ "import streamlit as st\n",
 
 
 
 
 
 
 
481
  "from sklearn.linear_model import LogisticRegression\n",
 
482
  "from sklearn.utils import resample\n",
 
 
 
 
483
  "\n",
484
+ "from src.configs import ModelConfigs\n",
485
  "\n",
 
 
486
  "\n",
487
+ "def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):\n",
488
+ "\n",
489
+ " n_instances, n_features = X.shape\n",
490
+ " n_classes = len(y_names)\n",
491
+ "\n",
492
+ " # NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n",
493
+ " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n",
494
+ "\n",
495
+ " sample_size = min(\n",
496
+ " # this is the maximum supported\n",
497
+ " configs.MAX_SELECTION.value,\n",
498
+ " # at minimum you want MIN_SELECTION but in general you want\n",
499
+ " # n_instances * sample_fraction\n",
500
+ " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n",
501
+ " # however if previous one is bigger the the available instances take\n",
502
+ " # the number of available instances\n",
503
+ " n_instances,\n",
504
+ " )\n",
505
+ "\n",
506
+ " # TODO: might want to try out something to subsample features at each iteration\n",
507
+ "\n",
508
+ " # initialize coefficient matrices\n",
509
+ " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n",
510
+ " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n",
511
+ "\n",
512
+ " with st.spinner(\"Wordifying!\"):\n",
513
+ " pbar = st.progress(0)\n",
514
+ "\n",
515
+ " for i, _ in enumerate(range(configs.NUM_ITERS.value)):\n",
516
+ "\n",
517
+ " # run randomized regression\n",
518
+ " clf = LogisticRegression(\n",
519
+ " penalty=\"l1\",\n",
520
+ " C=configs.PENALTIES.value[\n",
521
+ " np.random.randint(len(configs.PENALTIES.value))\n",
522
+ " ],\n",
523
+ " solver=\"liblinear\",\n",
524
+ " multi_class=\"auto\",\n",
525
+ " max_iter=500,\n",
526
+ " class_weight=\"balanced\",\n",
527
+ " )\n",
528
+ "\n",
529
+ " # sample indices to subsample matrix\n",
530
+ " selection = resample(\n",
531
+ " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n",
532
+ " )\n",
533
+ "\n",
534
+ " # fit\n",
535
+ " try:\n",
536
+ " clf.fit(X[selection], y[selection])\n",
537
+ " except ValueError:\n",
538
+ " continue\n",
539
+ "\n",
540
+ " # record coefficients\n",
541
+ " if n_classes == 2:\n",
542
+ " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n",
543
+ " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n",
544
+ " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n",
545
+ " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n",
546
+ " else:\n",
547
+ " pos_scores += clf.coef_ > 0\n",
548
+ " neg_scores += clf.coef_ < 0\n",
549
+ "\n",
550
+ " pbar.progress(i + 1)\n",
551
+ "\n",
552
+ " # normalize\n",
553
+ " pos_scores = pos_scores / configs.NUM_ITERS.value\n",
554
+ " neg_scores = neg_scores / configs.NUM_ITERS.value\n",
555
+ "\n",
556
+ " # get only active features\n",
557
+ " pos_positions = np.where(\n",
558
+ " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n",
559
+ " )\n",
560
+ " neg_positions = np.where(\n",
561
+ " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n",
562
+ " )\n",
563
+ "\n",
564
+ " # prepare DataFrame\n",
565
+ " pos = [\n",
566
+ " (X_names[i], pos_scores[c, i], y_names[c])\n",
567
+ " for c, i in zip(*pos_positions.nonzero())\n",
568
+ " ]\n",
569
+ " neg = [\n",
570
+ " (X_names[i], neg_scores[c, i], y_names[c])\n",
571
+ " for c, i in zip(*neg_positions.nonzero())\n",
572
+ " ]\n",
573
+ "\n",
574
+ " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n",
575
+ " [\"label\", \"score\"], ascending=False\n",
576
+ " )\n",
577
+ " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n",
578
+ " [\"label\", \"score\"], ascending=False\n",
579
+ " )\n",
580
+ "\n",
581
+ " return posdf, negdf\n"
582
  ]
583
  },
584
  {
585
  "cell_type": "code",
586
+ "execution_count": null,
587
  "metadata": {},
588
  "outputs": [],
589
  "source": [
 
592
  },
593
  {
594
  "cell_type": "code",
595
+ "execution_count": null,
596
  "metadata": {},
597
  "outputs": [],
598
  "source": [
 
601
  },
602
  {
603
  "cell_type": "code",
604
+ "execution_count": null,
605
  "metadata": {},
606
  "outputs": [],
607
  "source": [
 
613
  },
614
  {
615
  "cell_type": "code",
616
+ "execution_count": null,
617
  "metadata": {},
618
  "outputs": [],
619
  "source": [
 
626
  },
627
  {
628
  "cell_type": "code",
629
+ "execution_count": null,
630
  "metadata": {},
631
+ "outputs": [],
 
 
 
 
 
 
 
 
632
  "source": [
633
  "df[\"p_text\"] = prep.fit_transform(df[\"text\"])"
634
  ]
635
  },
636
  {
637
  "cell_type": "code",
638
+ "execution_count": null,
639
  "metadata": {},
640
  "outputs": [],
641
  "source": [
 
644
  },
645
  {
646
  "cell_type": "code",
647
+ "execution_count": null,
648
  "metadata": {},
649
  "outputs": [],
650
  "source": [
 
660
  },
661
  {
662
  "cell_type": "code",
663
+ "execution_count": null,
664
  "metadata": {},
665
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  "source": [
667
  "%%time\n",
668
  "clf.fit(X, y)"
 
677
  },
678
  {
679
  "cell_type": "code",
680
+ "execution_count": null,
681
  "metadata": {},
682
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  "source": [
684
  "n_instances, n_features = X.shape\n",
685
  "n_classes = len(y_names)\n",
 
765
  "outputs": [],
766
  "source": []
767
  }
768
+ ],
769
+ "metadata": {
770
+ "interpreter": {
771
+ "hash": "aa7efd0b3ada76bb0689aa8ed0b61d7de788847e3d11d2d142fc5800c765982f"
772
+ },
773
+ "kernelspec": {
774
+ "display_name": "Python 3.8.3 64-bit ('py38': conda)",
775
+ "language": "python",
776
+ "name": "python3"
777
+ },
778
+ "language_info": {
779
+ "codemirror_mode": {
780
+ "name": "ipython",
781
+ "version": 3
782
+ },
783
+ "file_extension": ".py",
784
+ "mimetype": "text/x-python",
785
+ "name": "python",
786
+ "nbconvert_exporter": "python",
787
+ "pygments_lexer": "ipython3",
788
+ "version": "3.7.11"
789
+ },
790
+ "orig_nbformat": 2
791
+ },
792
+ "nbformat": 4,
793
+ "nbformat_minor": 2
794
+ }
src/components.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from src.preprocessing import PreprocessingPipeline
3
+ from src.wordifier import input_transform, wordifier, output_transform
4
+ from src.configs import PreprocessingConfigs, SupportedFiles, Languages
5
+
6
+
7
+ @st.experimental_memo
8
+ def form(df):
9
+ with st.form("my_form"):
10
+ col1, col2 = st.columns([1, 2])
11
+ with col1:
12
+
13
+ cols = [""] + df.columns.tolist()
14
+ label_column = st.selectbox(
15
+ "Select label column", cols, index=0, help="Select the column containing the labels"
16
+ )
17
+ text_column = st.selectbox(
18
+ "Select text column", cols, index=0, help="Select the column containing the text"
19
+ )
20
+ language = st.selectbox(
21
+ "Select language",
22
+ [i.name for i in Languages],
23
+ help="""
24
+ Select the language of your texts amongst the supported ones. If we currently do
25
+ not support it, feel free to open an issue
26
+ """,
27
+ )
28
+
29
+ with col2:
30
+ steps_options = list(PreprocessingPipeline.pipeline_components().keys())
31
+ pre_steps = st.multiselect(
32
+ "Select pre-lemmatization processing steps (ordered)",
33
+ options=steps_options,
34
+ default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
35
+ format_func=lambda x: x.replace("_", " ").title(),
36
+ help="Select the processing steps to apply before the text is lemmatized",
37
+ )
38
+
39
+ lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
40
+ lemmatization_step = st.selectbox(
41
+ "Select lemmatization",
42
+ options=lammatization_options,
43
+ index=PreprocessingConfigs.DEFAULT_LEMMA.value,
44
+ help="Select lemmatization procedure",
45
+ )
46
+
47
+ post_steps = st.multiselect(
48
+ "Select post-lemmatization processing steps (ordered)",
49
+ options=steps_options,
50
+ default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
51
+ format_func=lambda x: x.replace("_", " ").title(),
52
+ help="Select the processing steps to apply after the text is lemmatized",
53
+ )
54
+
55
+ # Every form must have a submit button.
56
+ submitted = st.form_submit_button("Submit")
57
+ if submitted:
58
+
59
+ # preprocess
60
+ with st.spinner("Step 1/4: Preprocessing text"):
61
+ pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
62
+ df = pipe.vaex_process(df, text_column)
63
+
64
+ # prepare input
65
+ with st.spinner("Step 2/4: Preparing inputs"):
66
+ input_dict = input_transform(df[text_column], df[label_column])
67
+
68
+ # wordify
69
+ with st.spinner("Step 3/4: Wordifying"):
70
+ pos, neg = wordifier(**input_dict)
71
+
72
+ # prepare output
73
+ with st.spinner("Step 4/4: Preparing outputs"):
74
+ new_df = output_transform(pos, neg)
75
+
76
+ return new_df
77
+
78
+
79
+ def faq():
80
+ st.subheader("Frequently Asked Questions")
81
+ with st.expander("What is Wordify?"):
82
+ st.markdown(
83
+ """
84
+ __Wordify__ is a way to find out which n-grams (i.e., words and concatenations of words) are most indicative of each of your dependent
85
+ variable values.
86
+ """
87
+ )
88
+
89
+ with st.expander("What happens to my data?"):
90
+ st.markdown(
91
+ """
92
+ Nothing. We never store the data you upload on disk: it is only kept in memory for the
93
+ duration of the modeling, and then deleted. We do not retain any copies or traces of
94
+ your data.
95
+ """
96
+ )
97
+
98
+ with st.expander("What input formats do you support?"):
99
+ st.markdown(
100
+ f"""
101
+ We currently support {", ".join([i.name for i in SupportedFiles])}.
102
+ """
103
+ )
104
+
105
+ with st.expander("What languages are supported?"):
106
+ st.markdown(
107
+ f"""
108
+ Currently we support: {", ".join([i.name for i in Languages])}.
109
+ """
110
+ )
111
+
112
+ with st.expander("How does it work?"):
113
+ st.markdown(
114
+ """
115
+ It uses a variant of the Stability Selection algorithm
116
+ [(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
117
+ to fit hundreds of logistic regression models on random subsets of the data, using
118
+ different L1 penalties to drive as many of the term coefficients as possible to 0. Any terms that
119
+ receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
120
+ indicators.
121
+ """
122
+ )
123
+
124
+ with st.expander("What libraries do you use?"):
125
+ st.markdown(
126
+ """
127
+ We leverage the power of many great libraries in the Python ecosystem:
128
+ - `Streamlit`
129
+ - `Pandas`
130
+ - `Numpy`
131
+ - `Spacy`
132
+ - `Scikit-learn`
133
+ - `Vaex`
134
+ """
135
+ )
136
+
137
+ with st.expander("How much data do I need?"):
138
+ st.markdown(
139
+ """
140
+ We recommend at least 2000 instances, the more, the better. With fewer instances, the
141
+ results are less replicable and reliable.
142
+ """
143
+ )
144
+
145
+ with st.expander("Is there a paper I can cite?"):
146
+ st.markdown(
147
+ """
148
+ Yes, please! Cite [Wordify: A Tool for Discovering and Differentiating Consumer Vocabularies](https://academic.oup.com/jcr/article/48/3/394/6199426)
149
+ ```
150
+ @article{10.1093/jcr/ucab018,
151
+ author = {Hovy, Dirk and Melumad, Shiri and Inman, J Jeffrey},
152
+ title = "{Wordify: A Tool for Discovering and Differentiating Consumer Vocabularies}",
153
+ journal = {Journal of Consumer Research},
154
+ volume = {48},
155
+ number = {3},
156
+ pages = {394-414},
157
+ year = {2021},
158
+ month = {03},
159
+ abstract = "{This work describes and illustrates a free and easy-to-use online text-analysis tool for understanding how consumer word use varies across contexts. The tool, Wordify, uses randomized logistic regression (RLR) to identify the words that best discriminate texts drawn from different pre-classified corpora, such as posts written by men versus women, or texts containing mostly negative versus positive valence. We present illustrative examples to show how the tool can be used for such diverse purposes as (1) uncovering the distinctive vocabularies that consumers use when writing reviews on smartphones versus PCs, (2) discovering how the words used in Tweets differ between presumed supporters and opponents of a controversial ad, and (3) expanding the dictionaries of dictionary-based sentiment-measurement tools. We show empirically that Wordify’s RLR algorithm performs better at discriminating vocabularies than support vector machines and chi-square selectors, while offering significant advantages in computing time. A discussion is also provided on the use of Wordify in conjunction with other text-analysis tools, such as probabilistic topic modeling and sentiment analysis, to gain more profound knowledge of the role of language in consumer behavior.}",
160
+ issn = {0093-5301},
161
+ doi = {10.1093/jcr/ucab018},
162
+ url = {https://doi.org/10.1093/jcr/ucab018},
163
+ eprint = {https://academic.oup.com/jcr/article-pdf/48/3/394/40853499/ucab018.pdf},
164
+ }
165
+ ```
166
+ """
167
+ )
168
+
169
+ with st.expander("How can I reach out to the Wordify team?"):
170
+ st.markdown(contacts(), unsafe_allow_html=True)
171
+
172
+
173
+ def presentation():
174
+ st.markdown(
175
+ """
176
+ Wordify makes it easy to identify words that discriminate categories in textual data.
177
+
178
+ :point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
179
+ show an interactive UI*.
180
+ """
181
+ )
182
+
183
+ st.subheader("Input format")
184
+ st.markdown(
185
+ """
186
+ Please note that your file must have a column with the texts and a column with the labels,
187
+ for example
188
+ """
189
+ )
190
+ st.table(
191
+ {"text": ["A review", "Another review", "Yet another one", "etc"], "label": ["Good", "Bad", "Good", "etc"]}
192
+ )
193
+
194
+ st.subheader("Output format")
195
+ st.markdown(
196
+ """
197
+ As a result of the process, you will get a file containing 4 columns:
198
+ - `Word`: the n-gram (i.e., a word or a concatenation of words) considered
199
+ - `Score`: the wordify score, between 0 and 1, of how important `Word` is to discriminate `Label`
200
+ - `Label`: the label that `Word` is discriminating
201
+ - `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
202
+ """
203
+ )
204
+
205
+
206
+ def footer():
207
+ st.sidebar.markdown(
208
+ """
209
+ <span style="font-size: 0.75em">Built with &hearts; by [`Pietro Lesci`](https://pietrolesci.github.io/) and [`MilaNLP`](https://twitter.com/MilaNLProc?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor).</span>
210
+ """,
211
+ unsafe_allow_html=True,
212
+ )
213
+
214
+
215
+ def contacts():
216
+ return """
217
+ You can reach out to us via email, phone, or via mail
218
+
219
+ - :email: [email protected]
220
+
221
+ - :telephone_receiver: +39 02 5836 2604
222
+
223
+ - :postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
224
+
225
+
226
+ <iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
227
+ """
228
+
229
+ def about():
230
+ return """
231
+ The wordify team
232
+ """
src/configs.py CHANGED
@@ -10,6 +10,19 @@ class ModelConfigs(Enum):
10
  MIN_SELECTION = 10_000
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  class Languages(Enum):
14
  English = "en_core_web_sm"
15
  Italian = "it_core_news_sm"
 
10
  MIN_SELECTION = 10_000
11
 
12
 
13
+ class InputTransformConfigs(Enum):
14
+ NGRAM_RANGE = (1, 3)
15
+ MIN_DF = 0.001
16
+ MAX_DF = 0.75
17
+ SUBLINEAR = True
18
+
19
+
20
+ class PreprocessingConfigs(Enum):
21
+ DEFAULT_PRE = [1, 3, 5, 15, 21, 22, 18, 19, 0, 20, -1]
22
+ DEFAULT_LEMMA = 1
23
+ DEFAULT_POST = [20, -1]
24
+
25
+
26
  class Languages(Enum):
27
  English = "en_core_web_sm"
28
  Italian = "it_core_news_sm"
src/preprocessing.py CHANGED
@@ -1,56 +1,20 @@
 
 
1
  import re
2
  import string
3
  from collections import OrderedDict
4
- from typing import Callable, List, Optional, Tuple
5
 
6
- import numpy as np
7
  import pandas as pd
 
8
  import spacy
9
  import streamlit as st
 
10
  from pandas.core.series import Series
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.preprocessing import LabelEncoder
13
- from stqdm import stqdm
14
  from textacy.preprocessing import make_pipeline, normalize, remove, replace
15
 
16
  from .configs import Languages
17
 
18
- stqdm.pandas()
19
-
20
-
21
- def encode(text: pd.Series, labels: pd.Series):
22
- """
23
- Encodes text in mathematical object ameanable to training algorithm
24
- """
25
- tfidf_vectorizer = TfidfVectorizer(
26
- input="content", # default: file already in memory
27
- encoding="utf-8", # default
28
- decode_error="strict", # default
29
- strip_accents=None, # do nothing
30
- lowercase=False, # do nothing
31
- preprocessor=None, # do nothing - default
32
- tokenizer=None, # default
33
- stop_words=None, # do nothing
34
- analyzer="word",
35
- ngram_range=(1, 3), # maximum 3-ngrams
36
- min_df=0.001,
37
- max_df=0.75,
38
- sublinear_tf=True,
39
- )
40
- label_encoder = LabelEncoder()
41
-
42
- with st.spinner("Encoding text using TF-IDF and Encoding labels"):
43
- X = tfidf_vectorizer.fit_transform(text.values)
44
- y = label_encoder.fit_transform(labels.values)
45
-
46
- return {
47
- "X": X,
48
- "y": y,
49
- "X_names": np.array(tfidf_vectorizer.get_feature_names()),
50
- "y_names": label_encoder.classes_,
51
- }
52
-
53
-
54
  # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
55
  # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
56
  # fmt: off
@@ -87,118 +51,101 @@ def normalize_repeating_words(t):
87
  return _re_wrep.sub(_replace_wrep, t)
88
 
89
 
90
- # fmt: on
91
- class Lemmatizer:
92
- """Creates lemmatizer based on spacy"""
93
 
94
- def __init__(
95
- self, language: str, remove_stop: bool = True, lemmatization: bool = True
96
- ) -> None:
97
- self.language = language
98
- self.nlp = spacy.load(
99
- Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
100
- )
101
- self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
102
- self.lemmatization = lemmatization
103
 
104
- def _get_lemmatization_fn(
105
- self, remove_stop: bool, lemmatization: bool
106
- ) -> Optional[Callable]:
107
- """Return the correct spacy Doc-level lemmatizer"""
108
- if remove_stop and lemmatization:
109
 
110
- def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
111
- return " ".join(
112
- [t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
113
- )
114
 
115
- elif remove_stop and not lemmatization:
 
 
 
116
 
117
- def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
118
- return " ".join([t.text for t in doc if not t.is_stop])
119
 
120
- elif lemmatization and not remove_stop:
 
121
 
122
- def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
123
- return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
124
 
125
- else:
126
- self.status = False
127
- return
128
-
129
- return lemmatizer_fn
130
-
131
- def __call__(self, series: Series) -> Series:
132
- """
133
- Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
134
- """
135
- res = []
136
- pbar = stqdm(total=len(series), desc="Lemmatizing")
137
- for doc in self.nlp.pipe(series, batch_size=500):
138
- res.append(self._lemmatizer_fn(doc))
139
- pbar.update(1)
140
- pbar.close()
141
- return pd.Series(res)
142
 
143
 
 
144
  class PreprocessingPipeline:
145
  def __init__(
146
- self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]
 
 
 
 
147
  ):
 
 
 
 
148
 
149
- # build pipeline
150
- self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
151
- pre_steps, lemmatizer, post_steps
152
- )
153
 
154
- def __call__(self, series: Series) -> Series:
155
- with st.spinner("Pre-lemmatization cleaning"):
156
- res = series.progress_map(self.pre_pipeline)
 
 
157
 
158
- with st.spinner("Lemmatizing"):
159
- res = self.lemmatizer(series)
 
160
 
161
- with st.spinner("Post-lemmatization cleaning"):
162
- res = series.progress_map(self.post_pipeline)
163
 
164
- return res
165
 
166
- def make_pipeline(
167
- self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]
168
- ) -> Tuple[Callable]:
 
 
 
 
 
 
 
169
 
170
- # pre-lemmatization steps
171
- pre_steps = [
172
- self.pipeline_components()[step]
173
- for step in pre_steps
174
- if step in self.pipeline_components()
175
- ]
176
- pre_steps = make_pipeline(*pre_steps) if pre_steps else lambda x: x
177
 
178
- # lemmatization
179
- lemmatizer = lemmatizer if lemmatizer.lemmatization else lambda x: x
180
 
181
- # post lemmatization steps
182
- post_steps = [
183
- self.pipeline_components()[step]
184
- for step in post_steps
185
- if step in self.pipeline_components()
186
- ]
187
- post_steps = make_pipeline(*post_steps) if post_steps else lambda x: x
188
 
189
- return pre_steps, lemmatizer, post_steps
 
 
 
 
 
 
 
190
 
191
  @staticmethod
192
  def pipeline_components() -> "OrderedDict[str, Callable]":
193
  """Returns available cleaning steps in order"""
194
  return OrderedDict(
195
  [
196
- ("lower", lambda x: x.lower()),
197
  ("normalize_unicode", normalize.unicode),
198
  ("normalize_bullet_points", normalize.bullet_points),
199
  ("normalize_hyphenated_words", normalize.hyphenated_words),
200
  ("normalize_quotation_marks", normalize.quotation_marks),
201
- ("normalize_whitespace", normalize.whitespace),
202
  ("replace_urls", replace.urls),
203
  ("replace_currency_symbols", replace.currency_symbols),
204
  ("replace_emails", replace.emails),
@@ -216,6 +163,17 @@ class PreprocessingPipeline:
216
  ("normalize_useless_spaces", normalize_useless_spaces),
217
  ("normalize_repeating_chars", normalize_repeating_chars),
218
  ("normalize_repeating_words", normalize_repeating_words),
219
- ("strip", lambda x: x.strip()),
 
 
 
 
 
 
 
 
 
 
 
220
  ]
221
  )
 
1
+ import multiprocessing as mp
2
+ import os
3
  import re
4
  import string
5
  from collections import OrderedDict
6
+ from typing import Callable, List, Optional
7
 
 
8
  import pandas as pd
9
+ from pandas.core.frame import DataFrame
10
  import spacy
11
  import streamlit as st
12
+ import vaex
13
  from pandas.core.series import Series
 
 
 
14
  from textacy.preprocessing import make_pipeline, normalize, remove, replace
15
 
16
  from .configs import Languages
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
19
  # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
20
  # fmt: off
 
51
  return _re_wrep.sub(_replace_wrep, t)
52
 
53
 
54
def lowercase(t: str) -> str:
    """Return ``t`` with all cased characters converted to lowercase."""
    lowered = t.lower()
    return lowered
 
56
 
 
 
 
 
 
 
 
 
 
57
 
58
def strip(t: str) -> str:
    """Return ``t`` with leading and trailing whitespace removed."""
    trimmed = t.strip()
    return trimmed
 
 
 
60
 
 
 
 
 
61
 
62
def lemmatize_remove_stopwords(doc: spacy.tokens.doc.Doc) -> str:
    """Join the lemmas of ``doc`` with single spaces, leaving out stop words.

    Tokens whose lemma is the ``"-PRON-"`` placeholder are left out as well.
    """
    lemmas = []
    for token in doc:
        if token.lemma_ == "-PRON-" or token.is_stop:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)
66
 
 
 
67
 
68
def remove_stopwords(doc: spacy.tokens.doc.Doc) -> str:
    """Join the surface text of every non-stop-word token of ``doc`` with spaces."""
    kept = []
    for token in doc:
        if token.is_stop:
            continue
        kept.append(token.text)
    return " ".join(kept)
70
 
 
 
71
 
72
def lemmatize_keep_stopwords(doc: spacy.tokens.doc.Doc) -> str:
    """Join the lemmas of ``doc`` with single spaces, keeping stop words.

    Only tokens whose lemma is the ``"-PRON-"`` placeholder are left out.
    """
    lemmas = []
    for token in doc:
        if token.lemma_ == "-PRON-":
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
 
76
+ # fmt: on
77
  class PreprocessingPipeline:
78
  def __init__(
79
+ self,
80
+ language: str,
81
+ pre_steps: Optional[List[str]],
82
+ lemmatization_step: Optional[str],
83
+ post_steps: Optional[List[str]],
84
  ):
85
+ self.language = language
86
+ self.pre_steps = pre_steps
87
+ self.lemmatization_step = lemmatization_step
88
+ self.post_steps = post_steps
89
 
90
+ self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
91
+ self.pre = self.make_pre_post_component(self.pre_steps)
92
+ self.post = self.make_pre_post_component(self.post_steps)
93
+ self.lemma = self.lemmatization_component()[self.lemmatization_step]
94
 
95
+ def apply_multiproc(fn, series):
96
+ with mp.Pool(mp.cpu_count()) as pool:
97
+ new_series = pool.map(fn, series)
98
+
99
+ return new_series
100
 
101
def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
    """Run the pipeline over ``df[text_column]`` through vaex.

    The result is stored in a new ``"processed_text"`` column and the frame
    is converted back to pandas before being returned.

    Args:
        df: Input frame.
        text_column: Name of the column holding the raw text.

    Returns:
        A pandas DataFrame with the additional ``"processed_text"`` column.
    """

    def fn(t):
        # ``self.pre`` / ``self.lemma`` / ``self.post`` are ``None`` when the
        # corresponding stage is disabled; skip those instead of calling
        # ``None`` (same truthiness checks as ``__call__``).
        if self.pre:
            t = self.pre(t)
        if self.lemma:
            t = self.lemma(self.nlp(t))
        if self.post:
            t = self.post(t)
        return t

    vdf = vaex.from_pandas(df)
    vdf["processed_text"] = vdf.apply(fn, arguments=[vdf[text_column]], vectorize=False)

    return vdf.to_pandas_df()
109
 
110
+ def __call__(self, series: Series) -> Series:
111
+ if self.pre:
112
+ series = series.map(self.pre)
113
+
114
+ if self.lemma:
115
+ total_steps = len(series) // 100
116
+ res = []
117
+ pbar = st.progress(0)
118
+ for i, doc in enumerate(self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())):
119
+ res.append(self.lemma(doc))
120
 
121
+ if i % total_steps == 0:
122
+ pbar.progress(1)
 
 
 
 
 
123
 
124
+ series = pd.Series(res)
 
125
 
126
+ if self.post:
127
+ series = series.map(self.post)
 
 
 
 
 
128
 
129
+ return series
130
+
131
+ def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
132
+ if not steps:
133
+ return
134
+ components = [self.pipeline_components()[step] for step in steps]
135
+
136
+ return make_pipeline(*components)
137
 
138
  @staticmethod
139
  def pipeline_components() -> "OrderedDict[str, Callable]":
140
  """Returns available cleaning steps in order"""
141
  return OrderedDict(
142
  [
143
+ ("lowercase", lowercase),
144
  ("normalize_unicode", normalize.unicode),
145
  ("normalize_bullet_points", normalize.bullet_points),
146
  ("normalize_hyphenated_words", normalize.hyphenated_words),
147
  ("normalize_quotation_marks", normalize.quotation_marks),
148
+ ("normalize_whitespaces", normalize.whitespace),
149
  ("replace_urls", replace.urls),
150
  ("replace_currency_symbols", replace.currency_symbols),
151
  ("replace_emails", replace.emails),
 
163
  ("normalize_useless_spaces", normalize_useless_spaces),
164
  ("normalize_repeating_chars", normalize_repeating_chars),
165
  ("normalize_repeating_words", normalize_repeating_words),
166
+ ("strip", strip),
167
+ ]
168
+ )
169
+
170
@staticmethod
def lemmatization_component() -> "OrderedDict[str, Optional[Callable]]":
    """Return the lemmatization options, keyed by their display label.

    The ``"Disable lemmatizer"`` entry maps to ``None``; every other entry
    maps to a function that turns a spaCy ``Doc`` into a string.
    """
    options = OrderedDict()
    options["Spacy lemmatizer (keep stopwords)"] = lemmatize_keep_stopwords
    options["Spacy lemmatizer (no stopwords)"] = lemmatize_remove_stopwords
    options["Disable lemmatizer"] = None
    options["Remove stopwords"] = remove_stopwords
    return options
src/utils.py CHANGED
@@ -3,11 +3,9 @@ import altair as alt
3
  import pandas as pd
4
  import streamlit as st
5
  from PIL import Image
6
- from stqdm import stqdm
7
 
8
  from .configs import SupportedFiles
9
 
10
- stqdm.pandas()
11
 
12
 
13
  @st.cache
@@ -15,20 +13,19 @@ def get_logo(path):
15
  return Image.open(path)
16
 
17
 
18
- # @st.cache(suppress_st_warning=True)
19
  @st.cache(allow_output_mutation=True)
20
  def read_file(uploaded_file) -> pd.DataFrame:
21
-
22
  file_type = uploaded_file.name.split(".")[-1]
23
- if file_type in set(i.name for i in SupportedFiles):
24
- read_f = SupportedFiles[file_type].value[0]
25
- df = read_f(uploaded_file)
26
- # remove any NA
27
- df = df.dropna()
28
- return df
29
 
30
- else:
31
- st.error("File type not supported")
 
 
32
 
33
 
34
  def download_button(dataframe: pd.DataFrame, name: str):
@@ -55,12 +52,7 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
55
 
56
  return
57
 
58
- source = (
59
- data[label_column]
60
- .value_counts()
61
- .reset_index()
62
- .rename(columns={"index": "Labels", label_column: "Counts"})
63
- )
64
  source["Props"] = source["Counts"] / source["Counts"].sum()
65
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
66
 
@@ -73,9 +65,7 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
73
  )
74
  )
75
 
76
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
77
- text="Proportions:O"
78
- )
79
 
80
  return (bars + text).properties(height=300)
81
 
@@ -87,9 +77,7 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
87
  alt.Chart(source)
88
  .mark_bar()
89
  .encode(
90
- alt.X(
91
- f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
92
- ),
93
  alt.Y("count()", axis=alt.Axis(title="")),
94
  )
95
  )
@@ -99,11 +87,7 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
99
 
100
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
101
 
102
- source = (
103
- data.loc[data[label_col] == label]
104
- .sort_values("score", ascending=False)
105
- .head(100)
106
- )
107
 
108
  plot = (
109
  alt.Chart(source)
 
3
  import pandas as pd
4
  import streamlit as st
5
  from PIL import Image
 
6
 
7
  from .configs import SupportedFiles
8
 
 
9
 
10
 
11
  @st.cache
 
13
  return Image.open(path)
14
 
15
 
 
16
  @st.cache(allow_output_mutation=True)
17
  def read_file(uploaded_file) -> pd.DataFrame:
 
18
  file_type = uploaded_file.name.split(".")[-1]
19
+ read_fn = SupportedFiles[file_type].value[0]
20
+ df = read_fn(uploaded_file)
21
+ df = df.dropna()
22
+ return df
23
+
 
24
 
25
@st.cache
def convert_df(df):
    """Serialize ``df`` to UTF-8 encoded, semicolon-separated CSV bytes without the index."""
    # IMPORTANT: the result is cached so the conversion is not recomputed on every rerun
    csv_text = df.to_csv(index=False, sep=";")
    return csv_text.encode("utf-8")
29
 
30
 
31
  def download_button(dataframe: pd.DataFrame, name: str):
 
52
 
53
  return
54
 
55
+ source = data[label_column].value_counts().reset_index().rename(columns={"index": "Labels", label_column: "Counts"})
 
 
 
 
 
56
  source["Props"] = source["Counts"] / source["Counts"].sum()
57
  source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
58
 
 
65
  )
66
  )
67
 
68
+ text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")
 
 
69
 
70
  return (bars + text).properties(height=300)
71
 
 
77
  alt.Chart(source)
78
  .mark_bar()
79
  .encode(
80
+ alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")),
 
 
81
  alt.Y("count()", axis=alt.Axis(title="")),
82
  )
83
  )
 
87
 
88
  def plot_score(data: pd.DataFrame, label_col: str, label: str):
89
 
90
+ source = data.loc[data[label_col] == label].sort_values("score", ascending=False).head(100)
 
 
 
 
91
 
92
  plot = (
93
  alt.Chart(source)
src/wordifier.py CHANGED
@@ -1,17 +1,52 @@
1
- from typing import List
 
2
  import numpy as np
3
  import pandas as pd
4
  import streamlit as st
 
 
5
  from sklearn.linear_model import LogisticRegression
 
6
  from sklearn.utils import resample
7
- from stqdm import stqdm
8
 
9
- from .configs import ModelConfigs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- stqdm.pandas()
 
 
 
 
 
12
 
13
 
14
- def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
 
 
15
 
16
  n_instances, n_features = X.shape
17
  n_classes = len(y_names)
@@ -36,70 +71,62 @@ def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs
36
  pos_scores = np.zeros((n_classes, n_features), dtype=int)
37
  neg_scores = np.zeros((n_classes, n_features), dtype=int)
38
 
39
- with st.spinner("Wordifying!"):
40
-
41
- for _ in stqdm(range(configs.NUM_ITERS.value)):
42
-
43
- # run randomized regression
44
- clf = LogisticRegression(
45
- penalty="l1",
46
- C=configs.PENALTIES.value[
47
- np.random.randint(len(configs.PENALTIES.value))
48
- ],
49
- solver="liblinear",
50
- multi_class="auto",
51
- max_iter=500,
52
- class_weight="balanced",
53
- )
54
-
55
- # sample indices to subsample matrix
56
- selection = resample(
57
- np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
58
- )
59
-
60
- # fit
61
- try:
62
- clf.fit(X[selection], y[selection])
63
- except ValueError:
64
- continue
65
-
66
- # record coefficients
67
- if n_classes == 2:
68
- pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
69
- neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
70
- pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
71
- neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
72
- else:
73
- pos_scores += clf.coef_ > 0
74
- neg_scores += clf.coef_ < 0
75
-
76
- # normalize
77
- pos_scores = pos_scores / configs.NUM_ITERS.value
78
- neg_scores = neg_scores / configs.NUM_ITERS.value
79
-
80
- # get only active features
81
- pos_positions = np.where(
82
- pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
83
- )
84
- neg_positions = np.where(
85
- neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
86
  )
87
 
88
- # prepare DataFrame
89
- pos = [
90
- (X_names[i], pos_scores[c, i], y_names[c])
91
- for c, i in zip(*pos_positions.nonzero())
92
- ]
93
- neg = [
94
- (X_names[i], neg_scores[c, i], y_names[c])
95
- for c, i in zip(*neg_positions.nonzero())
96
- ]
97
-
98
- posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
99
- ["label", "score"], ascending=False
100
- )
101
- negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
102
- ["label", "score"], ascending=False
103
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- return posdf, negdf
 
1
+ from typing import Dict, List, Tuple
2
+
3
  import numpy as np
4
  import pandas as pd
5
  import streamlit as st
6
+ from pandas.core.frame import DataFrame
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.linear_model import LogisticRegression
9
+ from sklearn.preprocessing import LabelEncoder
10
  from sklearn.utils import resample
 
11
 
12
+ from .configs import InputTransformConfigs, ModelConfigs
13
+
14
+
15
def input_transform(text: pd.Series, labels: pd.Series, configs=InputTransformConfigs) -> Dict[str, np.ndarray]:
    """Turn texts and labels into the numeric inputs used for training.

    Texts are vectorized with TF-IDF and labels are integer-encoded.

    Args:
        text: Documents to vectorize.
        labels: One label per document.
        configs: Settings object providing ``NGRAM_RANGE``, ``MIN_DF``,
            ``MAX_DF`` and ``SUBLINEAR`` (each read through ``.value``).

    Returns:
        A dict with the TF-IDF matrix (``"X"``), the encoded labels (``"y"``),
        the feature names (``"X_names"``) and the label classes (``"y_names"``).
    """
    vectorizer = TfidfVectorizer(
        input="content",  # default: data already in memory
        encoding="utf-8",  # default
        decode_error="strict",  # default
        strip_accents=None,  # leave accents untouched
        lowercase=False,  # leave casing untouched
        preprocessor=None,  # default
        tokenizer=None,  # default
        stop_words=None,  # keep every word
        analyzer="word",
        ngram_range=configs.NGRAM_RANGE.value,
        min_df=configs.MIN_DF.value,
        max_df=configs.MAX_DF.value,
        sublinear_tf=configs.SUBLINEAR.value,
    )
    encoder = LabelEncoder()

    features = vectorizer.fit_transform(text.values)
    targets = encoder.fit_transform(labels.values)
    feature_names = np.array(vectorizer.get_feature_names_out())

    return {"X": features, "y": targets, "X_names": feature_names, "y_names": encoder.classes_}
45
 
46
 
47
+ def wordifier(
48
+ X: np.ndarray, y: np.ndarray, X_names: List[str], y_names: List[str], configs=ModelConfigs
49
+ ) -> List[Tuple[str, float, str]]:
50
 
51
  n_instances, n_features = X.shape
52
  n_classes = len(y_names)
 
71
  pos_scores = np.zeros((n_classes, n_features), dtype=int)
72
  neg_scores = np.zeros((n_classes, n_features), dtype=int)
73
 
74
+ pbar = st.progress(0)
75
+ for i, _ in enumerate(range(configs.NUM_ITERS.value)):
76
+
77
+ # run randomized regression
78
+ clf = LogisticRegression(
79
+ penalty="l1",
80
+ C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
81
+ solver="liblinear",
82
+ multi_class="auto",
83
+ max_iter=500,
84
+ class_weight="balanced",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  )
86
 
87
+ # sample indices to subsample matrix
88
+ selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
89
+
90
+ # fit
91
+ try:
92
+ clf.fit(X[selection], y[selection])
93
+ except ValueError:
94
+ continue
95
+
96
+ # record coefficients
97
+ if n_classes == 2:
98
+ pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
99
+ neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
100
+ pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
101
+ neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
102
+ else:
103
+ pos_scores += clf.coef_ > 0
104
+ neg_scores += clf.coef_ < 0
105
+
106
+ pbar.progress(round(i / configs.NUM_ITERS.value, 1))
107
+
108
+ # normalize
109
+ pos_scores = pos_scores / configs.NUM_ITERS.value
110
+ neg_scores = neg_scores / configs.NUM_ITERS.value
111
+
112
+ # get only active features
113
+ pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
114
+ neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
115
+
116
+ # prepare DataFrame
117
+ pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
118
+ neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
119
+
120
+ return pos, neg
121
+
122
+
123
def output_transform(pos: List[Tuple[str, float, str]], neg: List[Tuple[str, float, str]]) -> DataFrame:
    """Combine positive and negative word scores into one table.

    Each input is a list of ``(word, score, label)`` tuples. Both lists are
    sorted by label and score in descending order, tagged with a
    ``correlation`` value, and stacked with the positive rows first. Column
    names are title-cased in the result.
    """
    column_names = "word score label".split()

    def _as_frame(rows, sign):
        # Sort first, then tag, so the new column does not take part in sorting.
        frame = pd.DataFrame(rows, columns=column_names)
        frame = frame.sort_values(["label", "score"], ascending=False)
        frame["correlation"] = sign
        return frame

    frames = [_as_frame(pos, "positive"), _as_frame(neg, "negative")]
    output = pd.concat(frames, ignore_index=False, axis=0)
    output.columns = output.columns.str.title()

    return output