Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
54434cf
1
Parent(s):
9608f9f
add description
Browse files- src/components.py +173 -102
src/components.py
CHANGED
@@ -65,16 +65,12 @@ def form(df):
|
|
65 |
pre_steps = st.multiselect(
|
66 |
"Select pre-lemmatization processing steps (ordered)",
|
67 |
options=steps_options,
|
68 |
-
default=[
|
69 |
-
steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
|
70 |
-
],
|
71 |
format_func=lambda x: x.replace("_", " ").title(),
|
72 |
help="Select the processing steps to apply before the text is lemmatized",
|
73 |
)
|
74 |
|
75 |
-
lammatization_options = list(
|
76 |
-
PreprocessingPipeline.lemmatization_component().keys()
|
77 |
-
)
|
78 |
lemmatization_step = st.selectbox(
|
79 |
"Select lemmatization",
|
80 |
options=lammatization_options,
|
@@ -85,10 +81,7 @@ def form(df):
|
|
85 |
post_steps = st.multiselect(
|
86 |
"Select post-lemmatization processing steps (ordered)",
|
87 |
options=steps_options,
|
88 |
-
default=[
|
89 |
-
steps_options[i]
|
90 |
-
for i in PreprocessingConfigs.DEFAULT_POST.value
|
91 |
-
],
|
92 |
format_func=lambda x: x.replace("_", " ").title(),
|
93 |
help="Select the processing steps to apply after the text is lemmatized",
|
94 |
)
|
@@ -100,31 +93,21 @@ def form(df):
|
|
100 |
start_time = time.time()
|
101 |
|
102 |
# warnings about inputs
|
103 |
-
language_specific_warnings(
|
104 |
-
pre_steps, post_steps, lemmatization_step, language
|
105 |
-
)
|
106 |
|
107 |
# preprocess
|
108 |
if not disable_preprocessing:
|
109 |
with st.spinner("Step 1/4: Preprocessing text"):
|
110 |
-
pipe = PreprocessingPipeline(
|
111 |
-
language, pre_steps, lemmatization_step, post_steps
|
112 |
-
)
|
113 |
df = pipe.vaex_process(df, text_column)
|
114 |
else:
|
115 |
-
with st.spinner(
|
116 |
-
|
117 |
-
):
|
118 |
-
df = df.rename(
|
119 |
-
columns={text_column: ColumnNames.PROCESSED_TEXT.value}
|
120 |
-
)
|
121 |
time.sleep(1.2)
|
122 |
|
123 |
# prepare input
|
124 |
with st.spinner("Step 2/4: Preparing inputs"):
|
125 |
-
input_dict = input_transform(
|
126 |
-
df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
|
127 |
-
)
|
128 |
|
129 |
# wordify
|
130 |
with st.spinner("Step 3/4: Wordifying"):
|
@@ -146,6 +129,168 @@ def form(df):
|
|
146 |
return new_df, meta_data
|
147 |
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
def faq():
|
150 |
st.subheader("Frequently Asked Questions")
|
151 |
with st.expander("What is Wordify?"):
|
@@ -249,75 +394,6 @@ def faq():
|
|
249 |
st.markdown(contacts(), unsafe_allow_html=True)
|
250 |
|
251 |
|
252 |
-
def presentation():
|
253 |
-
st.markdown(
|
254 |
-
"""
|
255 |
-
Wordify makes it easy to identify words that discriminate categories in textual data.
|
256 |
-
|
257 |
-
:point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
|
258 |
-
show an interactive UI*.
|
259 |
-
"""
|
260 |
-
)
|
261 |
-
|
262 |
-
st.subheader("Quickstart")
|
263 |
-
st.markdown(
|
264 |
-
"""
|
265 |
-
- There is no need to preprocess your text, we will take care of it. However, if you wish to
|
266 |
-
do so, turn off preprocessing in the `Advanced Settings` in the interactive UI.
|
267 |
-
|
268 |
-
- We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case insensitive). If
|
269 |
-
you provide a file following this naming convention, Wordify will automatically select the
|
270 |
-
correct columns. However, if you wish to use a different nomenclature, you will be asked to
|
271 |
-
provide the column names in the interactive UI.
|
272 |
-
|
273 |
-
- Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
|
274 |
-
a new Wordify session is created and your progress is lost.
|
275 |
-
|
276 |
-
- Wordify's performance depends on the length of the individual texts in your file. The longer the texts, the higher
|
277 |
-
the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
|
278 |
-
We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
|
279 |
-
between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
|
280 |
-
If this is not enough, please do feel free to reach out to us directly so we can help.
|
281 |
-
"""
|
282 |
-
)
|
283 |
-
|
284 |
-
st.subheader("Input format")
|
285 |
-
st.markdown(
|
286 |
-
"""
|
287 |
-
Please note that your file must have a column with the texts and a column with the labels,
|
288 |
-
for example
|
289 |
-
"""
|
290 |
-
)
|
291 |
-
st.table(
|
292 |
-
{
|
293 |
-
"text": ["A review", "Another review", "Yet another one", "etc"],
|
294 |
-
"label": ["Good", "Bad", "Good", "etc"],
|
295 |
-
}
|
296 |
-
)
|
297 |
-
|
298 |
-
st.subheader("Output format")
|
299 |
-
st.markdown(
|
300 |
-
"""
|
301 |
-
As a result of the process, you will get a file containing 4 columns:
|
302 |
-
- `Word`: the n-gram (i.e., a word or a concatenation of words) considered
|
303 |
-
- `Score`: the wordify score, between 0 and 1, of how important `Word` is to discriminate `Label`
|
304 |
-
- `Label`: the label that `Word` is discriminating
|
305 |
-
- `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
|
306 |
-
|
307 |
-
for example
|
308 |
-
"""
|
309 |
-
)
|
310 |
-
|
311 |
-
st.table(
|
312 |
-
{
|
313 |
-
"Word": ["good", "awful", "bad service", "etc"],
|
314 |
-
"Score": ["0.52", "0.49", "0.35", "etc"],
|
315 |
-
"Label": ["Good", "Bad", "Good", "etc"],
|
316 |
-
"Correlation": ["positive", "positive", "negative", "etc"],
|
317 |
-
}
|
318 |
-
)
|
319 |
-
|
320 |
-
|
321 |
def footer():
|
322 |
st.sidebar.markdown(
|
323 |
"""
|
@@ -383,15 +459,11 @@ def analysis(outputs):
|
|
383 |
)
|
384 |
|
385 |
with st.expander("Vocabulary"):
|
386 |
-
st.markdown(
|
387 |
-
"The table below shows all candidate n-grams that Wordify considered"
|
388 |
-
)
|
389 |
st.write(meta_data["vocabulary"])
|
390 |
|
391 |
with st.expander("Labels"):
|
392 |
-
st.markdown(
|
393 |
-
"The table below summarizes the labels that your file contained"
|
394 |
-
)
|
395 |
st.write(meta_data["labels"])
|
396 |
|
397 |
return subset_df
|
@@ -421,6 +493,5 @@ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, langua
|
|
421 |
"Chinese",
|
422 |
):
|
423 |
st.info(
|
424 |
-
msg
|
425 |
-
+ " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
|
426 |
)
|
|
|
65 |
pre_steps = st.multiselect(
|
66 |
"Select pre-lemmatization processing steps (ordered)",
|
67 |
options=steps_options,
|
68 |
+
default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
|
|
|
|
|
69 |
format_func=lambda x: x.replace("_", " ").title(),
|
70 |
help="Select the processing steps to apply before the text is lemmatized",
|
71 |
)
|
72 |
|
73 |
+
lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
|
|
|
|
|
74 |
lemmatization_step = st.selectbox(
|
75 |
"Select lemmatization",
|
76 |
options=lammatization_options,
|
|
|
81 |
post_steps = st.multiselect(
|
82 |
"Select post-lemmatization processing steps (ordered)",
|
83 |
options=steps_options,
|
84 |
+
default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
|
|
|
|
|
|
|
85 |
format_func=lambda x: x.replace("_", " ").title(),
|
86 |
help="Select the processing steps to apply after the text is lemmatized",
|
87 |
)
|
|
|
93 |
start_time = time.time()
|
94 |
|
95 |
# warnings about inputs
|
96 |
+
language_specific_warnings(pre_steps, post_steps, lemmatization_step, language)
|
|
|
|
|
97 |
|
98 |
# preprocess
|
99 |
if not disable_preprocessing:
|
100 |
with st.spinner("Step 1/4: Preprocessing text"):
|
101 |
+
pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
|
|
|
|
|
102 |
df = pipe.vaex_process(df, text_column)
|
103 |
else:
|
104 |
+
with st.spinner("Step 1/4: Preprocessing has been disabled - doing nothing"):
|
105 |
+
df = df.rename(columns={text_column: ColumnNames.PROCESSED_TEXT.value})
|
|
|
|
|
|
|
|
|
106 |
time.sleep(1.2)
|
107 |
|
108 |
# prepare input
|
109 |
with st.spinner("Step 2/4: Preparing inputs"):
|
110 |
+
input_dict = input_transform(df[ColumnNames.PROCESSED_TEXT.value], df[label_column])
|
|
|
|
|
111 |
|
112 |
# wordify
|
113 |
with st.spinner("Step 3/4: Wordifying"):
|
|
|
129 |
return new_df, meta_data
|
130 |
|
131 |
|
132 |
+
def presentation():
|
133 |
+
st.markdown(
|
134 |
+
"""
|
135 |
+
Wordify makes it easy to identify words that discriminate categories in textual data.
|
136 |
+
It was proposed by Dirk Hovy, Shiri Melumad, and Jeffrey J Inman in
|
137 |
+
[Wordify: A Tool for Discovering and Differentiating Consumer Vocabularies](https://academic.oup.com/jcr/article/48/3/394/6199426).
|
138 |
+
|
139 |
+
:point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
|
140 |
+
show an interactive UI*.
|
141 |
+
"""
|
142 |
+
)
|
143 |
+
|
144 |
+
st.subheader("Quickstart")
|
145 |
+
st.markdown(
|
146 |
+
"""
|
147 |
+
- There is no need to preprocess your text, we will take care of it. However, if you wish to
|
148 |
+
do so, turn off preprocessing in the `Advanced Settings` in the interactive UI.
|
149 |
+
|
150 |
+
- We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case insensitive). If
|
151 |
+
you provide a file following this naming convention, Wordify will automatically select the
|
152 |
+
correct columns. However, if you wish to use a different nomenclature, you will be asked to
|
153 |
+
provide the column names in the interactive UI.
|
154 |
+
|
155 |
+
- Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
|
156 |
+
a new Wordify session is created and your progress is lost.
|
157 |
+
|
158 |
+
- Wordify's performance depends on the length of the individual texts in your file. The longer the texts, the higher
|
159 |
+
the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
|
160 |
+
We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
|
161 |
+
between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
|
162 |
+
If this is not enough, please do feel free to reach out to us directly so we can help.
|
163 |
+
"""
|
164 |
+
)
|
165 |
+
|
166 |
+
how_to_use()
|
167 |
+
how_it_works()
|
168 |
+
|
169 |
+
|
170 |
+
def how_to_use():
|
171 |
+
with st.expander("How to use Wordify"):
|
172 |
+
|
173 |
+
st.subheader("Input format")
|
174 |
+
st.markdown(
|
175 |
+
"""
|
176 |
+
Please note that your file must have a column with the texts and a column with the labels,
|
177 |
+
for example
|
178 |
+
"""
|
179 |
+
)
|
180 |
+
st.table(
|
181 |
+
{
|
182 |
+
"text": ["A review", "Another review", "Yet another one", "etc"],
|
183 |
+
"label": ["Good", "Bad", "Good", "etc"],
|
184 |
+
}
|
185 |
+
)
|
186 |
+
|
187 |
+
st.subheader("Output format")
|
188 |
+
st.markdown(
|
189 |
+
"""
|
190 |
+
As a result of the process, you will get a file containing 4 columns:
|
191 |
+
- `Word`: the n-gram (i.e., a word or a concatenation of words) considered
|
192 |
+
- `Score`: the wordify score, between 0 and 1, of how important `Word` is to discriminate `Label`
|
193 |
+
- `Label`: the label that `Word` is discriminating
|
194 |
+
- `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
|
195 |
+
|
196 |
+
for example
|
197 |
+
"""
|
198 |
+
)
|
199 |
+
|
200 |
+
st.table(
|
201 |
+
{
|
202 |
+
"Word": ["good", "awful", "bad service", "etc"],
|
203 |
+
"Score": ["0.52", "0.49", "0.35", "etc"],
|
204 |
+
"Label": ["Good", "Bad", "Good", "etc"],
|
205 |
+
"Correlation": ["positive", "positive", "negative", "etc"],
|
206 |
+
}
|
207 |
+
)
|
208 |
+
|
209 |
+
|
210 |
+
def how_it_works():
|
211 |
+
table2 = pd.DataFrame(
|
212 |
+
{
|
213 |
+
"Text": [
|
214 |
+
"Spice light wine",
|
215 |
+
"Wine oak heavy",
|
216 |
+
"Chardonnay buttery light",
|
217 |
+
"Wine light cherry",
|
218 |
+
"Chardonnay wine oak buttery",
|
219 |
+
],
|
220 |
+
"Label": ["Italy", "United States", "United States", "Italy", "United States"],
|
221 |
+
}
|
222 |
+
)
|
223 |
+
|
224 |
+
table3 = pd.DataFrame(
|
225 |
+
{
|
226 |
+
"Model": [1, 2, 3, 4],
|
227 |
+
"Buttery": [0.32, 0, 0, 0],
|
228 |
+
"Chardonnay": [3.78, 0, 0, 0],
|
229 |
+
"Cherry": [-2.49, 0, 0, -6.2],
|
230 |
+
"Heavy": [0, 3.62, 0, 0],
|
231 |
+
"Light": [-1.72, -4.38, 0, 0],
|
232 |
+
"Oak": [0, 0, 0, 0],
|
233 |
+
"Spice": [-2.49, 0, -6.2, 0],
|
234 |
+
"Wine": [0, 0, 0, 0],
|
235 |
+
},
|
236 |
+
dtype=str,
|
237 |
+
)
|
238 |
+
|
239 |
+
table4 = pd.DataFrame(
|
240 |
+
{
|
241 |
+
"Coefficient valence": ["positive", "negative"],
|
242 |
+
"Buttery": [0.25, 0],
|
243 |
+
"Chardonnay": [0.25, 0],
|
244 |
+
"Cherry": [0, 0.5],
|
245 |
+
"Heavy": [0.25, 0],
|
246 |
+
"Light": [0, 0.5],
|
247 |
+
"Oak": [0, 0],
|
248 |
+
"Spice": [0, 0.5],
|
249 |
+
"Wine": [0, 0],
|
250 |
+
},
|
251 |
+
dtype=str,
|
252 |
+
)
|
253 |
+
|
254 |
+
with st.expander("How Wordify works: an illustrative example"):
|
255 |
+
st.markdown(
|
256 |
+
f"""
|
257 |
+
To provide an intuitive example of how Wordify works, imagine we have the following five documents with hypothetical
|
258 |
+
descriptions of wines from the United States and Italy listed in table 2 (preprocessed to remove noise words).
|
259 |
+
"""
|
260 |
+
)
|
261 |
+
st.caption("Table 2: Descriptions of wines from the USA and Italy.")
|
262 |
+
st.table(table2)
|
263 |
+
|
264 |
+
st.markdown(
|
265 |
+
"""
|
266 |
+
Wordify now draws, say, four independent samples from this data, for example: `(1,3,4,5)`, `(1,2,2,4)`, `(1,1,2,3)`, and `(2,3,4,4)`.
|
267 |
+
We fit an L1-regularized Logistic Regression on each, with the United States as target class. This results in the following sparse
|
268 |
+
vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
|
269 |
+
"""
|
270 |
+
)
|
271 |
+
st.caption("Table 3: Coefficients for frequency of indicators in each of the four runs for US wines.")
|
272 |
+
st.table(table3)
|
273 |
+
|
274 |
+
st.markdown(
|
275 |
+
"""
|
276 |
+
We can now count for each indicator how many times out of the four runs it received a non-zero coefficient (the magnitude does not matter).
|
277 |
+
We distinguish by positive and negative coefficients, and divide the result by the number of runs (here, four), which yields the final indicators
|
278 |
+
that are positively and negatively correlated with the US wines.
|
279 |
+
"""
|
280 |
+
)
|
281 |
+
st.caption("Table 4: Final set of indicators that are positively versus negatively correlated with US wines.")
|
282 |
+
st.table(table4)
|
283 |
+
st.markdown(
|
284 |
+
"""
|
285 |
+
The results of table 4 suggest that a wine is likely to be from the United States if its description contains any of the following words: "buttery",
|
286 |
+
"chardonnay", or "heavy", and these words are similarly discriminative. In contrast, a wine is likely to not be from the United States if it contains
|
287 |
+
the words "spice", "light", or "cherry". It is also worth noting that "oak" and "wine", which were present for both Italian and US wines, were ultimately
|
288 |
+
not selected as discriminative indicators of US wines. Finally, we would conduct an analogous analysis with Italy as the target class to determine which
|
289 |
+
indicators are most and least discriminative of Italian wines.
|
290 |
+
"""
|
291 |
+
)
|
292 |
+
|
293 |
+
|
294 |
def faq():
|
295 |
st.subheader("Frequently Asked Questions")
|
296 |
with st.expander("What is Wordify?"):
|
|
|
394 |
st.markdown(contacts(), unsafe_allow_html=True)
|
395 |
|
396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
def footer():
|
398 |
st.sidebar.markdown(
|
399 |
"""
|
|
|
459 |
)
|
460 |
|
461 |
with st.expander("Vocabulary"):
|
462 |
+
st.markdown("The table below shows all candidate n-grams that Wordify considered")
|
|
|
|
|
463 |
st.write(meta_data["vocabulary"])
|
464 |
|
465 |
with st.expander("Labels"):
|
466 |
+
st.markdown("The table below summarizes the labels that your file contained")
|
|
|
|
|
467 |
st.write(meta_data["labels"])
|
468 |
|
469 |
return subset_df
|
|
|
493 |
"Chinese",
|
494 |
):
|
495 |
st.info(
|
496 |
+
msg + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
|
|
|
497 |
)
|