Pietro Lesci committed on
Commit
ab15c62
·
1 Parent(s): c700823

reformat code

Browse files
Files changed (2) hide show
  1. src/pages/home.py +14 -4
  2. src/preprocessing.py +14 -6
src/pages/home.py CHANGED
@@ -45,7 +45,9 @@ def write(session, uploaded_file):
45
  )
46
  with col2:
47
  cols_options = [""] + data.columns.tolist()
48
- label_column = st.selectbox("Select label column name", cols_options, index=0)
 
 
49
  with st.beta_expander("Description"):
50
  st.markdown("Select the column containing the labels.")
51
 
@@ -60,7 +62,9 @@ def write(session, uploaded_file):
60
  st.markdown("Select the column containing the texts.")
61
 
62
  if text_column:
63
- st.altair_chart(plot_nchars(data, text_column), use_container_width=True)
 
 
64
 
65
  # ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
66
  with st.beta_expander("Advanced options"):
@@ -151,7 +155,11 @@ def write(session, uploaded_file):
151
  sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
152
  sample_data[text_column]
153
  ).values
154
- st.table(sample_data.loc[:, [label_column, text_column, f"preprocessed_{text_column}"]])
 
 
 
 
155
 
156
  # ==== 4. RUN ==== #
157
  run_button = st.button("Wordify!")
@@ -183,7 +191,9 @@ def write(session, uploaded_file):
183
  col1, col2, col3 = st.beta_columns([2, 3, 3])
184
 
185
  with col1:
186
- label = st.selectbox("Select label", data[label_column].unique().tolist())
 
 
187
  # # with col2:
188
  # thres = st.slider(
189
  # "Select threshold",
 
45
  )
46
  with col2:
47
  cols_options = [""] + data.columns.tolist()
48
+ label_column = st.selectbox(
49
+ "Select label column name", cols_options, index=0
50
+ )
51
  with st.beta_expander("Description"):
52
  st.markdown("Select the column containing the labels.")
53
 
 
62
  st.markdown("Select the column containing the texts.")
63
 
64
  if text_column:
65
+ st.altair_chart(
66
+ plot_nchars(data, text_column), use_container_width=True
67
+ )
68
 
69
  # ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
70
  with st.beta_expander("Advanced options"):
 
155
  sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
156
  sample_data[text_column]
157
  ).values
158
+ st.table(
159
+ sample_data.loc[
160
+ :, [label_column, text_column, f"preprocessed_{text_column}"]
161
+ ]
162
+ )
163
 
164
  # ==== 4. RUN ==== #
165
  run_button = st.button("Wordify!")
 
191
  col1, col2, col3 = st.beta_columns([2, 3, 3])
192
 
193
  with col1:
194
+ label = st.selectbox(
195
+ "Select label", data[label_column].unique().tolist()
196
+ )
197
  # # with col2:
198
  # thres = st.slider(
199
  # "Select threshold",
src/preprocessing.py CHANGED
@@ -91,7 +91,9 @@ def normalize_repeating_words(t):
91
  class Lemmatizer:
92
  """Creates lemmatizer based on spacy"""
93
 
94
- def __init__(self, language: str, remove_stop: bool = True, lemmatization: bool = True) -> None:
 
 
95
  self.language = language
96
  self.nlp = spacy.load(
97
  Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
@@ -99,12 +101,16 @@ class Lemmatizer:
99
  self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
100
  self.lemmatization = lemmatization
101
 
102
- def _get_lemmatization_fn(self, remove_stop: bool, lemmatization: bool) -> Optional[Callable]:
 
 
103
  """Return the correct spacy Doc-level lemmatizer"""
104
  if remove_stop and lemmatization:
105
 
106
  def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
107
- return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop])
 
 
108
 
109
  elif remove_stop and not lemmatization:
110
 
@@ -136,7 +142,9 @@ class Lemmatizer:
136
 
137
 
138
  class PreprocessingPipeline:
139
- def __init__(self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]):
 
 
140
 
141
  # build pipeline
142
  self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
@@ -146,10 +154,10 @@ class PreprocessingPipeline:
146
  def __call__(self, series: Series) -> Series:
147
  with st.spinner("Pre-lemmatization cleaning"):
148
  res = series.progress_map(self.pre_pipeline)
149
-
150
  with st.spinner("Lemmatizing"):
151
  res = self.lemmatizer(series)
152
-
153
  with st.spinner("Post-lemmatization cleaning"):
154
  res = series.progress_map(self.post_pipeline)
155
 
 
91
  class Lemmatizer:
92
  """Creates lemmatizer based on spacy"""
93
 
94
+ def __init__(
95
+ self, language: str, remove_stop: bool = True, lemmatization: bool = True
96
+ ) -> None:
97
  self.language = language
98
  self.nlp = spacy.load(
99
  Languages[language].value, exclude=["parser", "ner", "pos", "tok2vec"]
 
101
  self._lemmatizer_fn = self._get_lemmatization_fn(remove_stop, lemmatization)
102
  self.lemmatization = lemmatization
103
 
104
+ def _get_lemmatization_fn(
105
+ self, remove_stop: bool, lemmatization: bool
106
+ ) -> Optional[Callable]:
107
  """Return the correct spacy Doc-level lemmatizer"""
108
  if remove_stop and lemmatization:
109
 
110
  def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
111
+ return " ".join(
112
+ [t.lemma_ for t in doc if t.lemma_ != "-PRON-" and not t.is_stop]
113
+ )
114
 
115
  elif remove_stop and not lemmatization:
116
 
 
142
 
143
 
144
  class PreprocessingPipeline:
145
+ def __init__(
146
+ self, pre_steps: List[str], lemmatizer: Lemmatizer, post_steps: List[str]
147
+ ):
148
 
149
  # build pipeline
150
  self.pre_pipeline, self.lemmatizer, self.post_pipeline = self.make_pipeline(
 
154
  def __call__(self, series: Series) -> Series:
155
  with st.spinner("Pre-lemmatization cleaning"):
156
  res = series.progress_map(self.pre_pipeline)
157
+
158
  with st.spinner("Lemmatizing"):
159
  res = self.lemmatizer(series)
160
+
161
  with st.spinner("Post-lemmatization cleaning"):
162
  res = series.progress_map(self.post_pipeline)
163