Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
02c2d7e
1
Parent(s):
a66b528
divide into individual files
Browse files- src/plotting.py +71 -0
- src/preprocessing.py +200 -0
- src/utils.py +9 -260
- src/wordifier.py +87 -0
src/plotting.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import altair as alt
|
2 |
+
import pandas as pd
|
3 |
+
import streamlit as st
|
4 |
+
from stqdm import stqdm
|
5 |
+
|
6 |
+
stqdm.pandas()
|
7 |
+
|
8 |
+
|
9 |
+
def plot_labels_prop(data: pd.DataFrame, label_column: str):
    """
    Build a bar chart of label counts annotated with their proportions.

    Parameters
    ----------
    data:
        DataFrame holding the label column.
    label_column:
        Name of the column whose distinct values are counted.

    Returns
    -------
    An Altair layered chart (bars + percentage text), or ``None`` when the
    column has more than 100 distinct values; in that case a Streamlit
    warning is shown instead of a chart.
    """
    unique_value_limit = 100

    if data[label_column].nunique() > unique_value_limit:

        st.warning(
            f"""
            The column you selected has more than {unique_value_limit} unique values.
            Are you sure it's the right column? If it is, please note that
            this will impact __Wordify__ performance.
            """
        )

        return

    # Name the index and the values explicitly instead of renaming the columns
    # produced by reset_index(): those default names differ between pandas
    # versions ("index"/<label_column> before 2.0, <label_column>/"count" after).
    source = data[label_column].value_counts().rename_axis("Labels").reset_index(name="Counts")
    source["Props"] = source["Counts"] / source["Counts"].sum()
    source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"

    bars = (
        alt.Chart(source)
        .mark_bar()
        .encode(
            x=alt.X("Labels:O", sort="-y"),
            y="Counts:Q",
        )
    )

    # Percentage labels are drawn on top of the bars, shifted 15px down.
    text = bars.mark_text(align="center", baseline="middle", dy=15).encode(text="Proportions:O")

    return (bars + text).properties(height=300)
|
41 |
+
|
42 |
+
|
43 |
+
def plot_nchars(data: pd.DataFrame, text_column: str):
    """
    Build a histogram of the number of characters per text.

    The lengths are computed with ``Series.str.len()`` on ``text_column`` and
    binned on the x axis; the y axis shows the count of texts per bin.
    """
    lengths = data[text_column].str.len().to_frame()

    x_axis = alt.X(f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text"))
    y_axis = alt.Y("count()", axis=alt.Axis(title=""))

    histogram = alt.Chart(lengths).mark_bar().encode(x_axis, y_axis)

    return histogram.properties(height=300)
|
56 |
+
|
57 |
+
|
58 |
+
def plot_score(data: pd.DataFrame, label_col: str, label: str):
    """
    Build a horizontal bar chart of the highest-scoring words for one label.

    Rows whose ``label_col`` equals ``label`` are kept, sorted by ``score`` in
    descending order and truncated to the first 100. The chart height grows
    with the number of rows (30px each, never below 50px).
    """
    matches_label = data[label_col] == label
    top_words = data.loc[matches_label].sort_values("score", ascending=False).head(100)

    word_axis = alt.Y("word:O", sort="-x")
    chart = alt.Chart(top_words).mark_bar().encode(y=word_axis, x="score:Q")

    chart_height = max(30 * top_words.shape[0], 50)
    return chart.properties(height=chart_height)
|
src/preprocessing.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
from collections import OrderedDict
|
4 |
+
from typing import Callable, Dict, List
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
import spacy
|
9 |
+
import streamlit as st
|
10 |
+
from pandas.core.series import Series
|
11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
12 |
+
from sklearn.preprocessing import LabelEncoder
|
13 |
+
from stqdm import stqdm
|
14 |
+
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
15 |
+
|
16 |
+
from .configs import Languages
|
17 |
+
|
18 |
+
stqdm.pandas()
|
19 |
+
|
20 |
+
|
21 |
+
def encode(text: pd.Series, labels: pd.Series):
    """
    Encode texts and labels into objects amenable to a training algorithm.

    Texts are vectorised with TF-IDF over word 1- to 3-grams (sublinear tf,
    ``min_df=0.001``, ``max_df=0.75``, no lowercasing, accent stripping or
    stop-word removal). Labels are mapped to integer ids with a
    ``LabelEncoder``.

    Parameters
    ----------
    text:
        Series of documents; only ``text.values`` is used.
    labels:
        Series of labels; only ``labels.values`` is used.

    Returns
    -------
    dict with keys
        ``"X"``: the fitted TF-IDF matrix,
        ``"y"``: the encoded labels,
        ``"X_names"``: array of feature (n-gram) names,
        ``"y_names"``: the label encoder classes.
    """
    tfidf_vectorizer = TfidfVectorizer(
        input="content",  # default: file already in memory
        encoding="utf-8",  # default
        decode_error="strict",  # default
        strip_accents=None,  # do nothing
        lowercase=False,  # do nothing
        preprocessor=None,  # do nothing - default
        tokenizer=None,  # default
        stop_words=None,  # do nothing
        analyzer="word",
        ngram_range=(1, 3),  # maximum 3-ngrams
        min_df=0.001,
        max_df=0.75,
        sublinear_tf=True,
    )
    label_encoder = LabelEncoder()

    with st.spinner("Encoding text using TF-IDF and Encoding labels"):
        X = tfidf_vectorizer.fit_transform(text.values)
        y = label_encoder.fit_transform(labels.values)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer get_feature_names_out() and fall back only on older releases.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    return {
        "X": X,
        "y": y,
        # dtype=str keeps the unicode dtype the list-based call used to produce
        "X_names": np.array(feature_names, dtype=str),
        "y_names": label_encoder.classes_,
    }
|
52 |
+
|
53 |
+
|
54 |
+
# more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
|
55 |
+
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
56 |
+
# fmt: off
|
57 |
+
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
def normalize_acronyms(t):
    """
    Collapse dotted acronyms into upper-case words, e.g. ``u.s.a.`` -> ``USA``.

    An acronym is two or more consecutive "letter followed by a dot" pairs.
    Only the matched acronym is rewritten; the rest of ``t`` is left untouched.
    """
    # A callable replacement is used so that only the matched span is transformed
    # and so that backslashes in the text are never interpreted as escapes.
    return _re_normalize_acronyms.sub(lambda m: m.group(0).replace(".", "").upper(), t)
|
60 |
+
|
61 |
+
|
62 |
+
_re_non_word = re.compile(r"\W")
def remove_non_word(t):
    """Return ``t`` with every non-word character (regex ``\\W``) replaced by a space."""
    spaced = _re_non_word.sub(" ", t)
    return spaced
|
65 |
+
|
66 |
+
|
67 |
+
_re_space = re.compile(r" {2,}")
def normalize_useless_spaces(t):
    """Return ``t`` with each run of two or more space characters squeezed to one space."""
    squeezed = _re_space.sub(" ", t)
    return squeezed
|
70 |
+
|
71 |
+
|
72 |
+
_re_rep = re.compile(r"(\S)(\1{2,})")
def normalize_repeating_chars(t):
    """
    Collapse any non-space character repeated three or more times in a row
    into a single occurrence (``coooool`` -> ``col``).
    """
    # Group 1 is the repeated character; the run that follows it is dropped.
    return _re_rep.sub(lambda match: match.group(1), t)
|
79 |
+
|
80 |
+
|
81 |
+
_re_wrep = re.compile(r"\b(\w+)(?:\s+\1\b){2,}")
def normalize_repeating_words(t):
    """
    Collapse a word repeated three or more times in a row into one occurrence.

    ``"I am so so so happy"`` becomes ``"I am so happy"``. A word that appears
    only twice in a row (``"that that"``) is left alone. The whitespace and
    punctuation surrounding the run are preserved, so neighbouring words are
    never glued together.
    """
    # The pattern matches only the repeated words themselves (anchored on word
    # boundaries), so replacing the match with group 1 keeps the surrounding text.
    return _re_wrep.sub(lambda m: m.group(1), t)
|
88 |
+
|
89 |
+
# fmt: on
|
90 |
+
class TextPreprocessor:
    """
    Configurable text cleaning and lemmatization pipeline.

    Cleaning steps are looked up by name in ``_cleaning_options()`` and chained
    in the order given by the caller. Lemmatization is done with a spaCy
    pipeline and can run before cleaning, after cleaning, or not at all.
    """

    def __init__(
        self,
        language: str,
        cleaning_steps: List[str],
        lemmatizer_when: str = "last",
        remove_stop: bool = True,
    ) -> None:
        """
        Parameters
        ----------
        language:
            Member name of the ``Languages`` enum; its value is passed to ``spacy.load``.
        cleaning_steps:
            Names of cleaning steps to apply. Names that are not keys of
            ``_cleaning_options()`` are ignored.
        lemmatizer_when:
            Either a display label from ``_lemmatization_options()`` (for example
            ``"After preprocessing"``) or one of its values (``"first"``, ``"last"``,
            ``None``). Anything else disables lemmatization.
        remove_stop:
            When true, stop words are dropped during lemmatization.
        """
        # prepare lemmatizer
        self.language = language
        self.nlp = spacy.load(
            Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
        )

        # Accept both the display labels and the internal values: the default
        # "last" is not a display label and would otherwise silently resolve to
        # None, i.e. no lemmatization at all.
        when_options = self._lemmatization_options()
        if lemmatizer_when in when_options:
            self.lemmatizer_when = when_options[lemmatizer_when]
        elif lemmatizer_when in when_options.values():
            self.lemmatizer_when = lemmatizer_when
        else:
            self.lemmatizer_when = None

        self.remove_stop = remove_stop
        self._lemmatize = self._get_lemmatizer()

        # prepare cleaning
        available_steps = self._cleaning_options()
        self.cleaning_steps = [
            available_steps[step] for step in cleaning_steps if step in available_steps
        ]
        # With no valid step the pipeline is the identity function.
        self.cleaning_pipeline = (
            make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
        )

    def _get_lemmatizer(self) -> Callable:
        """Return the correct spacy Doc-level lemmatizer"""
        # "-PRON-" lemmas are always skipped (presumably spaCy's pronoun
        # placeholder in older releases; confirm against the spaCy version in use).
        if self.remove_stop:

            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
                """Lemmatizes spacy Doc and removes stopwords"""
                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])

        else:

            def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
                """Lemmatizes spacy Doc"""
                return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])

        return lemmatizer

    @staticmethod
    def _lemmatization_options() -> Dict[str, str]:
        """Map display labels to the internal lemmatization timing values."""
        return {
            "Before preprocessing": "first",
            "After preprocessing": "last",
            "Never! Let's do it quick and dirty": None,
        }

    def lemmatizer(self, series: pd.Series) -> pd.Series:
        """
        Apply spacy pipeline to transform string to spacy Doc and applies lemmatization

        The returned Series keeps the index of ``series`` so that the result
        stays aligned with the other columns of the originating DataFrame.
        """
        res = []
        pbar = stqdm(total=len(series))
        try:
            for doc in self.nlp.pipe(series, batch_size=500):
                res.append(self._lemmatize(doc))
                pbar.update(1)
        finally:
            # close the progress bar even when spaCy raises part-way through
            pbar.close()
        return pd.Series(res, index=series.index)

    @staticmethod
    def _cleaning_options():
        """Returns available cleaning steps in order"""
        return OrderedDict(
            [
                ("lower", lambda x: x.lower()),
                ("normalize_unicode", normalize.unicode),
                ("normalize_bullet_points", normalize.bullet_points),
                ("normalize_hyphenated_words", normalize.hyphenated_words),
                ("normalize_quotation_marks", normalize.quotation_marks),
                ("normalize_whitespace", normalize.whitespace),
                ("replace_urls", replace.urls),
                ("replace_currency_symbols", replace.currency_symbols),
                ("replace_emails", replace.emails),
                ("replace_emojis", replace.emojis),
                ("replace_hashtags", replace.hashtags),
                ("replace_numbers", replace.numbers),
                ("replace_phone_numbers", replace.phone_numbers),
                ("replace_user_handles", replace.user_handles),
                ("normalize_acronyms", normalize_acronyms),
                ("remove_accents", remove.accents),
                ("remove_brackets", remove.brackets),
                ("remove_html_tags", remove.html_tags),
                ("remove_punctuation", remove.punctuation),
                ("remove_non_words", remove_non_word),
                ("normalize_useless_spaces", normalize_useless_spaces),
                ("normalize_repeating_chars", normalize_repeating_chars),
                ("normalize_repeating_words", normalize_repeating_words),
                ("strip", lambda x: x.strip()),
            ]
        )

    def fit_transform(self, series: pd.Series) -> Series:
        """
        Applies text preprocessing

        Runs lemmatization first or last depending on ``self.lemmatizer_when``,
        with the cleaning pipeline mapped over every element in between.
        """

        if self.lemmatizer_when == "first":
            with st.spinner("Lemmatizing"):
                series = self.lemmatizer(series)

        with st.spinner("Cleaning"):
            series = series.progress_map(self.cleaning_pipeline)

        if self.lemmatizer_when == "last":
            with st.spinner("Lemmatizing"):
                series = self.lemmatizer(series)

        return series
|
src/utils.py
CHANGED
@@ -1,24 +1,12 @@
|
|
1 |
import base64
|
2 |
-
import re
|
3 |
-
from collections import OrderedDict
|
4 |
-
from typing import Callable, Dict, List
|
5 |
-
|
6 |
import altair as alt
|
7 |
-
import numpy as np
|
8 |
import pandas as pd
|
9 |
-
import spacy
|
10 |
import streamlit as st
|
11 |
-
from pandas.core.series import Series
|
12 |
from PIL import Image
|
13 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
14 |
-
from sklearn.linear_model import LogisticRegression
|
15 |
-
from sklearn.preprocessing import LabelEncoder
|
16 |
-
from sklearn.utils import resample
|
17 |
from stqdm import stqdm
|
18 |
-
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
19 |
|
20 |
-
from .configs import
|
21 |
-
|
22 |
stqdm.pandas()
|
23 |
|
24 |
|
@@ -27,7 +15,7 @@ def get_logo(path):
|
|
27 |
return Image.open(path)
|
28 |
|
29 |
|
30 |
-
# @st.cache(suppress_st_warning=True)
|
31 |
@st.cache(allow_output_mutation=True)
|
32 |
def read_file(uploaded_file) -> pd.DataFrame:
|
33 |
|
@@ -51,258 +39,19 @@ def download_button(dataframe: pd.DataFrame, name: str):
|
|
51 |
st.write(href, unsafe_allow_html=True)
|
52 |
|
53 |
|
54 |
-
def encode(text: pd.Series, labels: pd.Series):
|
55 |
-
tfidf_vectorizer = TfidfVectorizer(
|
56 |
-
input="content", # default: file already in memory
|
57 |
-
encoding="utf-8", # default
|
58 |
-
decode_error="strict", # default
|
59 |
-
strip_accents=None, # do nothing
|
60 |
-
lowercase=False, # do nothing
|
61 |
-
preprocessor=None, # do nothing - default
|
62 |
-
tokenizer=None, # default
|
63 |
-
stop_words=None, # do nothing
|
64 |
-
analyzer="word",
|
65 |
-
ngram_range=(1, 3), # maximum 3-ngrams
|
66 |
-
min_df=0.001,
|
67 |
-
max_df=0.75,
|
68 |
-
sublinear_tf=True,
|
69 |
-
)
|
70 |
-
label_encoder = LabelEncoder()
|
71 |
-
|
72 |
-
with st.spinner("Encoding text using TF-IDF and Encoding labels"):
|
73 |
-
X = tfidf_vectorizer.fit_transform(text.values)
|
74 |
-
y = label_encoder.fit_transform(labels.values)
|
75 |
-
|
76 |
-
return {
|
77 |
-
"X": X,
|
78 |
-
"y": y,
|
79 |
-
"X_names": np.array(tfidf_vectorizer.get_feature_names()),
|
80 |
-
"y_names": label_encoder.classes_,
|
81 |
-
}
|
82 |
-
|
83 |
-
|
84 |
-
def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
|
85 |
-
|
86 |
-
n_instances, n_features = X.shape
|
87 |
-
n_classes = len(y_names)
|
88 |
-
|
89 |
-
# NOTE: the * 10 / 10 trick is to have "nice" round-ups
|
90 |
-
sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
|
91 |
-
|
92 |
-
sample_size = min(
|
93 |
-
# this is the maximum supported
|
94 |
-
configs.MAX_SELECTION.value,
|
95 |
-
# at minimum you want MIN_SELECTION but in general you want
|
96 |
-
# n_instances * sample_fraction
|
97 |
-
max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
|
98 |
-
# however if previous one is bigger the the available instances take
|
99 |
-
# the number of available instances
|
100 |
-
n_instances,
|
101 |
-
)
|
102 |
-
|
103 |
-
# TODO: might want to try out something to subsample features at each iteration
|
104 |
-
|
105 |
-
# initialize coefficient matrices
|
106 |
-
pos_scores = np.zeros((n_classes, n_features), dtype=int)
|
107 |
-
neg_scores = np.zeros((n_classes, n_features), dtype=int)
|
108 |
-
|
109 |
-
with st.spinner("Wordifying!"):
|
110 |
-
|
111 |
-
for _ in stqdm(range(configs.NUM_ITERS.value)):
|
112 |
-
|
113 |
-
# run randomized regression
|
114 |
-
clf = LogisticRegression(
|
115 |
-
penalty="l1",
|
116 |
-
C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],
|
117 |
-
solver="liblinear",
|
118 |
-
multi_class="auto",
|
119 |
-
max_iter=500,
|
120 |
-
class_weight="balanced",
|
121 |
-
)
|
122 |
-
|
123 |
-
# sample indices to subsample matrix
|
124 |
-
selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)
|
125 |
-
|
126 |
-
# fit
|
127 |
-
try:
|
128 |
-
clf.fit(X[selection], y[selection])
|
129 |
-
except ValueError:
|
130 |
-
continue
|
131 |
-
|
132 |
-
# record coefficients
|
133 |
-
if n_classes == 2:
|
134 |
-
pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
|
135 |
-
neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
|
136 |
-
pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
|
137 |
-
neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
|
138 |
-
else:
|
139 |
-
pos_scores += clf.coef_ > 0
|
140 |
-
neg_scores += clf.coef_ < 0
|
141 |
-
|
142 |
-
# normalize
|
143 |
-
pos_scores = pos_scores / configs.NUM_ITERS.value
|
144 |
-
neg_scores = neg_scores / configs.NUM_ITERS.value
|
145 |
-
|
146 |
-
# get only active features
|
147 |
-
pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
|
148 |
-
neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)
|
149 |
-
|
150 |
-
# prepare DataFrame
|
151 |
-
pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
|
152 |
-
neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]
|
153 |
-
|
154 |
-
posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
|
155 |
-
negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
|
156 |
-
|
157 |
-
return posdf, negdf
|
158 |
-
|
159 |
-
|
160 |
-
# more [here](https://github.com/fastai/fastai/blob/master/fastai/text/core.py#L42)
|
161 |
-
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
162 |
-
_re_normalize_acronyms = re.compile("(?:[a-zA-Z]\.){2,}")
|
163 |
-
def normalize_acronyms(t):
|
164 |
-
return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
|
165 |
-
|
166 |
-
_re_non_word = re.compile("\W")
|
167 |
-
def remove_non_word(t):
|
168 |
-
return _re_non_word.sub(" ", t)
|
169 |
-
|
170 |
-
_re_space = re.compile(" {2,}")
|
171 |
-
def normalize_useless_spaces(t):
|
172 |
-
return _re_space.sub(" ", t)
|
173 |
-
|
174 |
-
|
175 |
-
_re_rep = re.compile(r"(\S)(\1{2,})")
|
176 |
-
def normalize_repeating_chars(t):
|
177 |
-
def _replace_rep(m):
|
178 |
-
c, cc = m.groups()
|
179 |
-
return c
|
180 |
-
|
181 |
-
return _re_rep.sub(_replace_rep, t)
|
182 |
-
|
183 |
-
|
184 |
-
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
185 |
-
def normalize_repeating_words(t):
|
186 |
-
def _replace_wrep(m):
|
187 |
-
c, cc, e = m.groups()
|
188 |
-
return c
|
189 |
-
|
190 |
-
return _re_wrep.sub(_replace_wrep, t)
|
191 |
-
|
192 |
-
|
193 |
-
class TextPreprocessor:
|
194 |
-
def __init__(
|
195 |
-
self, language: str, cleaning_steps: List[str], lemmatizer_when: str = "last", remove_stop: bool = True
|
196 |
-
) -> None:
|
197 |
-
# prepare lemmatizer
|
198 |
-
self.language = language
|
199 |
-
self.nlp = spacy.load(Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"])
|
200 |
-
self.lemmatizer_when = self._lemmatization_options().get(lemmatizer_when, None)
|
201 |
-
self.remove_stop = remove_stop
|
202 |
-
self._lemmatize = self._get_lemmatizer()
|
203 |
-
|
204 |
-
# prepare cleaning
|
205 |
-
self.cleaning_steps = [
|
206 |
-
self._cleaning_options()[step] for step in cleaning_steps if step in self._cleaning_options()
|
207 |
-
]
|
208 |
-
self.cleaning_pipeline = make_pipeline(*self.cleaning_steps) if self.cleaning_steps else lambda x: x
|
209 |
-
|
210 |
-
def _get_lemmatizer(self) -> Callable:
|
211 |
-
"""Return the correct spacy Doc-level lemmatizer"""
|
212 |
-
if self.remove_stop:
|
213 |
-
|
214 |
-
def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
|
215 |
-
"""Lemmatizes spacy Doc and removes stopwords"""
|
216 |
-
return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
|
217 |
-
|
218 |
-
else:
|
219 |
-
|
220 |
-
def lemmatizer(doc: spacy.tokens.doc.Doc) -> str:
|
221 |
-
"""Lemmatizes spacy Doc"""
|
222 |
-
return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
|
223 |
-
|
224 |
-
return lemmatizer
|
225 |
-
|
226 |
-
@staticmethod
|
227 |
-
def _lemmatization_options() -> Dict[str, str]:
|
228 |
-
return {
|
229 |
-
"Before preprocessing": "first",
|
230 |
-
"After preprocessing": "last",
|
231 |
-
"Never! Let's do it quick and dirty": None,
|
232 |
-
}
|
233 |
-
|
234 |
-
def lemmatizer(self, series: pd.Series) -> pd.Series:
|
235 |
-
"""
|
236 |
-
Apply spacy pipeline to transform string to spacy Doc and applies lemmatization
|
237 |
-
"""
|
238 |
-
res = []
|
239 |
-
pbar = stqdm(total=len(series))
|
240 |
-
for doc in self.nlp.pipe(series, batch_size=500):
|
241 |
-
res.append(self._lemmatize(doc))
|
242 |
-
pbar.update(1)
|
243 |
-
pbar.close()
|
244 |
-
return pd.Series(res)
|
245 |
-
|
246 |
-
@staticmethod
|
247 |
-
def _cleaning_options():
|
248 |
-
"""Returns available cleaning steps in order"""
|
249 |
-
return OrderedDict(
|
250 |
-
[
|
251 |
-
("lower", lambda x: x.lower()),
|
252 |
-
("normalize_unicode", normalize.unicode),
|
253 |
-
("normalize_bullet_points", normalize.bullet_points),
|
254 |
-
("normalize_hyphenated_words", normalize.hyphenated_words),
|
255 |
-
("normalize_quotation_marks", normalize.quotation_marks),
|
256 |
-
("normalize_whitespace", normalize.whitespace),
|
257 |
-
("replace_urls", replace.urls),
|
258 |
-
("replace_currency_symbols", replace.currency_symbols),
|
259 |
-
("replace_emails", replace.emails),
|
260 |
-
("replace_emojis", replace.emojis),
|
261 |
-
("replace_hashtags", replace.hashtags),
|
262 |
-
("replace_numbers", replace.numbers),
|
263 |
-
("replace_phone_numbers", replace.phone_numbers),
|
264 |
-
("replace_user_handles", replace.user_handles),
|
265 |
-
("normalize_acronyms", normalize_acronyms),
|
266 |
-
("remove_accents", remove.accents),
|
267 |
-
("remove_brackets", remove.brackets),
|
268 |
-
("remove_html_tags", remove.html_tags),
|
269 |
-
("remove_punctuation", remove.punctuation),
|
270 |
-
("remove_non_words", remove_non_word),
|
271 |
-
("normalize_useless_spaces", normalize_useless_spaces),
|
272 |
-
("normalize_repeating_chars", normalize_repeating_chars),
|
273 |
-
("normalize_repeating_words", normalize_repeating_words),
|
274 |
-
("strip", lambda x: x.strip()),
|
275 |
-
]
|
276 |
-
)
|
277 |
-
|
278 |
-
def fit_transform(self, series: pd.Series) -> Series:
|
279 |
-
"""Applies text preprocessing"""
|
280 |
-
|
281 |
-
if self.lemmatizer_when == "first":
|
282 |
-
with st.spinner("Lemmatizing"):
|
283 |
-
series = self.lemmatizer(series)
|
284 |
-
|
285 |
-
with st.spinner("Cleaning"):
|
286 |
-
series = series.progress_map(self.cleaning_pipeline)
|
287 |
-
|
288 |
-
if self.lemmatizer_when == "last":
|
289 |
-
with st.spinner("Lemmatizing"):
|
290 |
-
series = self.lemmatizer(series)
|
291 |
-
|
292 |
-
return series
|
293 |
-
|
294 |
-
|
295 |
def plot_labels_prop(data: pd.DataFrame, label_column: str):
|
296 |
|
297 |
unique_value_limit = 100
|
298 |
-
|
299 |
if data[label_column].nunique() > unique_value_limit:
|
300 |
|
301 |
-
st.warning(
|
302 |
-
|
|
|
303 |
Are you sure it's the right column? If it is, please note that
|
304 |
this will impact __Wordify__ performance.
|
305 |
-
"""
|
|
|
306 |
|
307 |
return
|
308 |
|
|
|
1 |
import base64
|
|
|
|
|
|
|
|
|
2 |
import altair as alt
|
|
|
3 |
import pandas as pd
|
|
|
4 |
import streamlit as st
|
|
|
5 |
from PIL import Image
|
|
|
|
|
|
|
|
|
6 |
from stqdm import stqdm
|
|
|
7 |
|
8 |
+
from .configs import SupportedFiles
|
9 |
+
|
10 |
stqdm.pandas()
|
11 |
|
12 |
|
|
|
15 |
return Image.open(path)
|
16 |
|
17 |
|
18 |
+
# @st.cache(suppress_st_warning=True)
|
19 |
@st.cache(allow_output_mutation=True)
|
20 |
def read_file(uploaded_file) -> pd.DataFrame:
|
21 |
|
|
|
39 |
st.write(href, unsafe_allow_html=True)
|
40 |
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
def plot_labels_prop(data: pd.DataFrame, label_column: str):
|
43 |
|
44 |
unique_value_limit = 100
|
45 |
+
|
46 |
if data[label_column].nunique() > unique_value_limit:
|
47 |
|
48 |
+
st.warning(
|
49 |
+
f"""
|
50 |
+
The column you selected has more than {unique_value_limit}.
|
51 |
Are you sure it's the right column? If it is, please note that
|
52 |
this will impact __Wordify__ performance.
|
53 |
+
"""
|
54 |
+
)
|
55 |
|
56 |
return
|
57 |
|
src/wordifier.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import streamlit as st
|
5 |
+
from sklearn.linear_model import LogisticRegression
|
6 |
+
from sklearn.utils import resample
|
7 |
+
from stqdm import stqdm
|
8 |
+
|
9 |
+
from .configs import ModelConfigs
|
10 |
+
|
11 |
+
stqdm.pandas()
|
12 |
+
|
13 |
+
|
14 |
+
def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
    """
    Score features by how often L1-regularised logistic regressions select them.

    For ``configs.NUM_ITERS.value`` iterations a stratified bootstrap sample of
    the rows is drawn, a logistic regression with a randomly picked penalty
    from ``configs.PENALTIES.value`` is fitted, and the sign of every
    coefficient is recorded per class. The counts are then turned into
    frequencies, and features whose frequency reaches
    ``configs.SELECTION_THRESHOLD.value`` are reported.

    Parameters
    ----------
    X:
        Feature matrix of shape ``(n_instances, n_features)``
        (presumably the sparse TF-IDF matrix; confirm against the caller).
    y:
        Encoded labels, indexable by an integer array.
    X_names:
        Feature names, indexed by column position of ``X``.
    y_names:
        Class names, indexed by class position.
    configs:
        Enum-like object exposing ``MAX_SELECTION``, ``MIN_SELECTION``,
        ``NUM_ITERS``, ``PENALTIES`` and ``SELECTION_THRESHOLD`` via ``.value``.

    Returns
    -------
    (posdf, negdf):
        Two DataFrames with columns ``word``, ``score`` and ``label`` for the
        positively and negatively associated features, sorted by label and
        score in descending order.
    """

    n_instances, n_features = X.shape
    n_classes = len(y_names)

    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

    sample_size = min(
        # this is the maximum supported
        configs.MAX_SELECTION.value,
        # at minimum you want MIN_SELECTION but in general you want
        # n_instances * sample_fraction
        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
        # however if the previous one is bigger than the available instances take
        # the number of available instances
        n_instances,
    )

    # TODO: might want to try out something to subsample features at each iteration

    # initialize coefficient matrices
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    penalties = configs.PENALTIES.value
    # number of iterations whose fit succeeded and contributed to the counts
    n_successful = 0

    with st.spinner("Wordifying!"):

        for _ in stqdm(range(configs.NUM_ITERS.value)):

            # run randomized regression
            clf = LogisticRegression(
                penalty="l1",
                C=penalties[np.random.randint(len(penalties))],
                solver="liblinear",
                multi_class="auto",
                max_iter=500,
                class_weight="balanced",
            )

            # sample indices to subsample matrix
            selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)

            # fit; a failed fit is skipped and does not count as an iteration
            try:
                clf.fit(X[selection], y[selection])
            except ValueError:
                continue
            n_successful += 1

            # record coefficients
            if n_classes == 2:
                # binary case: a single coefficient row describes class 1,
                # so its sign is mirrored for class 0
                pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)
                neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)
                pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)
                neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)
            else:
                pos_scores += clf.coef_ > 0
                neg_scores += clf.coef_ < 0

    # normalize by the fits that actually contributed, so skipped iterations do
    # not deflate the frequencies; max(..., 1) avoids dividing by zero when
    # every fit failed (all scores are then 0)
    denominator = max(n_successful, 1)
    pos_scores = pos_scores / denominator
    neg_scores = neg_scores / denominator

    # get only active features
    pos_positions = np.where(pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0)
    neg_positions = np.where(neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0)

    # prepare DataFrame
    pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]
    neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]

    posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(["label", "score"], ascending=False)
    negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(["label", "score"], ascending=False)

    return posdf, negdf
|