Pietro Lesci committed on
Commit
54434cf
·
1 Parent(s): 9608f9f

add description

Browse files
Files changed (1) hide show
  1. src/components.py +173 -102
src/components.py CHANGED
@@ -65,16 +65,12 @@ def form(df):
65
  pre_steps = st.multiselect(
66
  "Select pre-lemmatization processing steps (ordered)",
67
  options=steps_options,
68
- default=[
69
- steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
70
- ],
71
  format_func=lambda x: x.replace("_", " ").title(),
72
  help="Select the processing steps to apply before the text is lemmatized",
73
  )
74
 
75
- lammatization_options = list(
76
- PreprocessingPipeline.lemmatization_component().keys()
77
- )
78
  lemmatization_step = st.selectbox(
79
  "Select lemmatization",
80
  options=lammatization_options,
@@ -85,10 +81,7 @@ def form(df):
85
  post_steps = st.multiselect(
86
  "Select post-lemmatization processing steps (ordered)",
87
  options=steps_options,
88
- default=[
89
- steps_options[i]
90
- for i in PreprocessingConfigs.DEFAULT_POST.value
91
- ],
92
  format_func=lambda x: x.replace("_", " ").title(),
93
  help="Select the processing steps to apply after the text is lemmatized",
94
  )
@@ -100,31 +93,21 @@ def form(df):
100
  start_time = time.time()
101
 
102
  # warnings about inputs
103
- language_specific_warnings(
104
- pre_steps, post_steps, lemmatization_step, language
105
- )
106
 
107
  # preprocess
108
  if not disable_preprocessing:
109
  with st.spinner("Step 1/4: Preprocessing text"):
110
- pipe = PreprocessingPipeline(
111
- language, pre_steps, lemmatization_step, post_steps
112
- )
113
  df = pipe.vaex_process(df, text_column)
114
  else:
115
- with st.spinner(
116
- "Step 1/4: Preprocessing has been disabled - doing nothing"
117
- ):
118
- df = df.rename(
119
- columns={text_column: ColumnNames.PROCESSED_TEXT.value}
120
- )
121
  time.sleep(1.2)
122
 
123
  # prepare input
124
  with st.spinner("Step 2/4: Preparing inputs"):
125
- input_dict = input_transform(
126
- df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
127
- )
128
 
129
  # wordify
130
  with st.spinner("Step 3/4: Wordifying"):
@@ -146,6 +129,168 @@ def form(df):
146
  return new_df, meta_data
147
 
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  def faq():
150
  st.subheader("Frequently Asked Questions")
151
  with st.expander("What is Wordify?"):
@@ -249,75 +394,6 @@ def faq():
249
  st.markdown(contacts(), unsafe_allow_html=True)
250
 
251
 
252
- def presentation():
253
- st.markdown(
254
- """
255
- Wordify makes it easy to identify words that discriminate categories in textual data.
256
-
257
- :point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
258
- show an interactive UI*.
259
- """
260
- )
261
-
262
- st.subheader("Quickstart")
263
- st.markdown(
264
- """
265
- - There is no need to preprocess your text, we will take care of it. However, if you wish to
266
- do so, turn off preprocessing in the `Advanced Settings` in the interactive UI.
267
-
268
- - We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case insensitive). If
269
- you provide a file following this naming convention, Wordify will automatically select the
270
- correct columns. However, if you wish to use a different nomenclature, you will be asked to
271
- provide the column names in the interactive UI.
272
-
273
- - Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
274
- a new Wordify session is created and your progress is lost.
275
-
276
- - Wordify performances depend on the length of the individual texts in your file. The longer the texts, the higher
277
- the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
278
- We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
279
- between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
280
- If this is not enough, please do feel free to reach out to us directly so we can help.
281
- """
282
- )
283
-
284
- st.subheader("Input format")
285
- st.markdown(
286
- """
287
- Please note that your file must have a column with the texts and a column with the labels,
288
- for example
289
- """
290
- )
291
- st.table(
292
- {
293
- "text": ["A review", "Another review", "Yet another one", "etc"],
294
- "label": ["Good", "Bad", "Good", "etc"],
295
- }
296
- )
297
-
298
- st.subheader("Output format")
299
- st.markdown(
300
- """
301
- As a result of the process, you will get a file containing 4 columns:
302
- - `Word`: the n-gram (i.e., a word or a concatenation of words) considered
303
- - `Score`: the wordify score, between 0 and 1, of how important is `Word` to discrimitate `Label`
304
- - `Label`: the label that `Word` is discriminating
305
- - `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
306
-
307
- for example
308
- """
309
- )
310
-
311
- st.table(
312
- {
313
- "Word": ["good", "awful", "bad service", "etc"],
314
- "Score": ["0.52", "0.49", "0.35", "etc"],
315
- "Label": ["Good", "Bad", "Good", "etc"],
316
- "Correlation": ["positive", "positive", "negative", "etc"],
317
- }
318
- )
319
-
320
-
321
  def footer():
322
  st.sidebar.markdown(
323
  """
@@ -383,15 +459,11 @@ def analysis(outputs):
383
  )
384
 
385
  with st.expander("Vocabulary"):
386
- st.markdown(
387
- "The table below shows all candidate n-grams that Wordify considered"
388
- )
389
  st.write(meta_data["vocabulary"])
390
 
391
  with st.expander("Labels"):
392
- st.markdown(
393
- "The table below summarizes the labels that your file contained"
394
- )
395
  st.write(meta_data["labels"])
396
 
397
  return subset_df
@@ -421,6 +493,5 @@ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, langua
421
  "Chinese",
422
  ):
423
  st.info(
424
- msg
425
- + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
426
  )
 
65
  pre_steps = st.multiselect(
66
  "Select pre-lemmatization processing steps (ordered)",
67
  options=steps_options,
68
+ default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
 
 
69
  format_func=lambda x: x.replace("_", " ").title(),
70
  help="Select the processing steps to apply before the text is lemmatized",
71
  )
72
 
73
+ lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
 
 
74
  lemmatization_step = st.selectbox(
75
  "Select lemmatization",
76
  options=lammatization_options,
 
81
  post_steps = st.multiselect(
82
  "Select post-lemmatization processing steps (ordered)",
83
  options=steps_options,
84
+ default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
 
 
 
85
  format_func=lambda x: x.replace("_", " ").title(),
86
  help="Select the processing steps to apply after the text is lemmatized",
87
  )
 
93
  start_time = time.time()
94
 
95
  # warnings about inputs
96
+ language_specific_warnings(pre_steps, post_steps, lemmatization_step, language)
 
 
97
 
98
  # preprocess
99
  if not disable_preprocessing:
100
  with st.spinner("Step 1/4: Preprocessing text"):
101
+ pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
 
 
102
  df = pipe.vaex_process(df, text_column)
103
  else:
104
+ with st.spinner("Step 1/4: Preprocessing has been disabled - doing nothing"):
105
+ df = df.rename(columns={text_column: ColumnNames.PROCESSED_TEXT.value})
 
 
 
 
106
  time.sleep(1.2)
107
 
108
  # prepare input
109
  with st.spinner("Step 2/4: Preparing inputs"):
110
+ input_dict = input_transform(df[ColumnNames.PROCESSED_TEXT.value], df[label_column])
 
 
111
 
112
  # wordify
113
  with st.spinner("Step 3/4: Wordifying"):
 
129
  return new_df, meta_data
130
 
131
 
132
+ def presentation():
133
+ st.markdown(
134
+ """
135
+ Wordify makes it easy to identify words that discriminate categories in textual data.
136
+ It was proposed by Dirk Hovy, Shiri Melumad, and Jeffrey J Inman in
137
+ [Wordify: A Tool for Discovering and Differentiating Consumer Vocabularies](https://academic.oup.com/jcr/article/48/3/394/6199426).
138
+
139
+ :point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
140
+ show an interactive UI*.
141
+ """
142
+ )
143
+
144
+ st.subheader("Quickstart")
145
+ st.markdown(
146
+ """
147
+ - There is no need to preprocess your text, we will take care of it. However, if you wish to
148
+ do so, turn off preprocessing in the `Advanced Settings` in the interactive UI.
149
+
150
+ - We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case insensitive). If
151
+ you provide a file following this naming convention, Wordify will automatically select the
152
+ correct columns. However, if you wish to use a different nomenclature, you will be asked to
153
+ provide the column names in the interactive UI.
154
+
155
+ - Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
156
+ a new Wordify session is created and your progress is lost.
157
+
158
+ - Wordify performances depend on the length of the individual texts in your file. The longer the texts, the higher
159
+ the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
160
+ We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
161
+ between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
162
+ If this is not enough, please do feel free to reach out to us directly so we can help.
163
+ """
164
+ )
165
+
166
+ how_to_use()
167
+ how_it_works()
168
+
169
+
170
+ def how_to_use():
171
+ with st.expander("How to use Wordify"):
172
+
173
+ st.subheader("Input format")
174
+ st.markdown(
175
+ """
176
+ Please note that your file must have a column with the texts and a column with the labels,
177
+ for example
178
+ """
179
+ )
180
+ st.table(
181
+ {
182
+ "text": ["A review", "Another review", "Yet another one", "etc"],
183
+ "label": ["Good", "Bad", "Good", "etc"],
184
+ }
185
+ )
186
+
187
+ st.subheader("Output format")
188
+ st.markdown(
189
+ """
190
+ As a result of the process, you will get a file containing 4 columns:
191
+ - `Word`: the n-gram (i.e., a word or a concatenation of words) considered
192
+ - `Score`: the wordify score, between 0 and 1, of how important is `Word` to discrimitate `Label`
193
+ - `Label`: the label that `Word` is discriminating
194
+ - `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
195
+
196
+ for example
197
+ """
198
+ )
199
+
200
+ st.table(
201
+ {
202
+ "Word": ["good", "awful", "bad service", "etc"],
203
+ "Score": ["0.52", "0.49", "0.35", "etc"],
204
+ "Label": ["Good", "Bad", "Good", "etc"],
205
+ "Correlation": ["positive", "positive", "negative", "etc"],
206
+ }
207
+ )
208
+
209
+
210
+ def how_it_works():
211
+ table2 = pd.DataFrame(
212
+ {
213
+ "Text": [
214
+ "Spice light wine",
215
+ "Wine oak heavy",
216
+ "Chardonnay buttery light",
217
+ "Wine light cherry",
218
+ "Chardonnay wine oak buttery",
219
+ ],
220
+ "Label": ["Italy", "United States", "United States", "Italy", "United States"],
221
+ }
222
+ )
223
+
224
+ table3 = pd.DataFrame(
225
+ {
226
+ "Model": [1, 2, 3, 4],
227
+ "Buttery": [0.32, 0, 0, 0],
228
+ "Chardonnay": [3.78, 0, 0, 0],
229
+ "Cherry": [-2.49, 0, 0, -6.2],
230
+ "Heavy": [0, 3.62, 0, 0],
231
+ "Light": [-1.72, -4.38, 0, 0],
232
+ "Oak": [0, 0, 0, 0],
233
+ "Spice": [-2.49, 0, -6.2, 0],
234
+ "Wine": [0, 0, 0, 0],
235
+ },
236
+ dtype=str,
237
+ )
238
+
239
+ table4 = pd.DataFrame(
240
+ {
241
+ "Coefficient valence": ["positive", "negative"],
242
+ "Buttery": [0.25, 0],
243
+ "Chardonnay": [0.25, 0],
244
+ "Cherry": [0, 0.5],
245
+ "Heavy": [0.25, 0],
246
+ "Light": [0, 0.5],
247
+ "Oak": [0, 0],
248
+ "Spice": [0, 0.5],
249
+ "Wine": [0, 0],
250
+ },
251
+ dtype=str,
252
+ )
253
+
254
+ with st.expander("How Wordify works: an illustrative example"):
255
+ st.markdown(
256
+ f"""
257
+ To provide an intuitive example of how Wordify works, imagine we have the following five documents with hypothetical
258
+ descriptions of wines from the United States and Italy listed in table 2 (preprocessed to remove noise words).
259
+ """
260
+ )
261
+ st.caption("Table 2: Descriptions of wines from the USA and Italy.")
262
+ st.table(table2)
263
+
264
+ st.markdown(
265
+ """
266
+ Wordify now draws, say, four independent samples from this data, for example: `(1,3,4,5)`, `(1,2,2,4)`, `(1,1,2,3)`, and `(2,3,4,4)`.
267
+ We fit an L1-regularized Logistic Regression on each, with the United States as target class. This result in the following sparse
268
+ vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
269
+ """
270
+ )
271
+ st.caption("Table 3: Coefficients for frequency of indicators in each of the four runs for US wines.")
272
+ st.table(table3)
273
+
274
+ st.markdown(
275
+ """
276
+ We can now count for each indicator how many times out of the four runs it received a non-zero coefficient (the magnitude does not matter).
277
+ We distinguish by positive and negative coefficients, and divide the result by the number of runs (here, four), which yields the final indicators
278
+ that are positively and negatively correlated with the US wines.
279
+ """
280
+ )
281
+ st.caption("Table 4: Final set of indicators that are positively versus negatively correlated with US wines.")
282
+ st.table(table4)
283
+ st.markdown(
284
+ """
285
+ The results of table 4 suggest that a wine is likely to be from the United States if its description contains any of the following words: "buttery",
286
+ "chardonnay", or "heavy", and these words are similarly discriminative. In contrast, a wine is likely to not be from the United States if it contains
287
+ the words "spice", "light", or "cherry". It is also worth noting that "oak" and "wine", which were present for both Italian and US wines, were ultimately
288
+ not selected as discriminative indicators of US wines. Finally, we would conduct an analogous analysis with Italy as the target class to determine which
289
+ indicators are most and least discriminative of Italian wines.
290
+ """
291
+ )
292
+
293
+
294
  def faq():
295
  st.subheader("Frequently Asked Questions")
296
  with st.expander("What is Wordify?"):
 
394
  st.markdown(contacts(), unsafe_allow_html=True)
395
 
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def footer():
398
  st.sidebar.markdown(
399
  """
 
459
  )
460
 
461
  with st.expander("Vocabulary"):
462
+ st.markdown("The table below shows all candidate n-grams that Wordify considered")
 
 
463
  st.write(meta_data["vocabulary"])
464
 
465
  with st.expander("Labels"):
466
+ st.markdown("The table below summarizes the labels that your file contained")
 
 
467
  st.write(meta_data["labels"])
468
 
469
  return subset_df
 
493
  "Chinese",
494
  ):
495
  st.info(
496
+ msg + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
 
497
  )