Pietro Lesci committed on
Commit
fdbadfe
·
1 Parent(s): ca663e1

add missing typing

Browse files
Files changed (2) hide show
  1. src/preprocessing.py +11 -11
  2. src/utils.py +68 -67
src/preprocessing.py CHANGED
@@ -19,22 +19,22 @@ from .configs import Languages
19
  # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
20
  # fmt: off
21
  _re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
22
- def normalize_acronyms(t):
23
  return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
24
 
25
 
26
  _re_non_word = re.compile(r"\W")
27
- def remove_non_word(t):
28
  return _re_non_word.sub(" ", t)
29
 
30
 
31
  _re_space = re.compile(r" {2,}")
32
- def normalize_useless_spaces(t):
33
  return _re_space.sub(" ", t)
34
 
35
 
36
  _re_rep = re.compile(r"(\S)(\1{2,})")
37
- def normalize_repeating_chars(t):
38
  def _replace_rep(m):
39
  c, cc = m.groups()
40
  return c
@@ -43,7 +43,7 @@ def normalize_repeating_chars(t):
43
 
44
 
45
  _re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
46
- def normalize_repeating_words(t):
47
  def _replace_wrep(m):
48
  c, cc, e = m.groups()
49
  return c
@@ -92,11 +92,10 @@ class PreprocessingPipeline:
92
  self.post = self.make_pre_post_component(self.post_steps)
93
  self.lemma = self.lemmatization_component()[self.lemmatization_step]
94
 
95
- def apply_multiproc(fn, series):
96
- with mp.Pool(mp.cpu_count()) as pool:
97
- new_series = pool.map(fn, series)
98
-
99
- return new_series
100
 
101
  def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
102
  def fn(t):
@@ -106,8 +105,9 @@ class PreprocessingPipeline:
106
  vdf["processed_text"] = vdf.apply(
107
  fn, arguments=[vdf[text_column]], vectorize=False
108
  )
 
109
 
110
- return vdf.to_pandas_df()
111
 
112
  def __call__(self, series: Series) -> Series:
113
  if self.pre:
 
19
  # and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
20
  # fmt: off
21
  _re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
22
+ def normalize_acronyms(t: str) -> str:
23
  return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
24
 
25
 
26
  _re_non_word = re.compile(r"\W")
27
+ def remove_non_word(t: str) -> str:
28
  return _re_non_word.sub(" ", t)
29
 
30
 
31
  _re_space = re.compile(r" {2,}")
32
+ def normalize_useless_spaces(t: str) -> str:
33
  return _re_space.sub(" ", t)
34
 
35
 
36
  _re_rep = re.compile(r"(\S)(\1{2,})")
37
+ def normalize_repeating_chars(t: str) -> str:
38
  def _replace_rep(m):
39
  c, cc = m.groups()
40
  return c
 
43
 
44
 
45
  _re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
46
+ def normalize_repeating_words(t: str) -> str:
47
  def _replace_wrep(m):
48
  c, cc, e = m.groups()
49
  return c
 
92
  self.post = self.make_pre_post_component(self.post_steps)
93
  self.lemma = self.lemmatization_component()[self.lemmatization_step]
94
 
95
+ # def apply_multiproc(fn, series):
96
+ # with mp.Pool(mp.cpu_count()) as pool:
97
+ # new_series = pool.map(fn, series)
98
+ # return new_series
 
99
 
100
  def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
101
  def fn(t):
 
105
  vdf["processed_text"] = vdf.apply(
106
  fn, arguments=[vdf[text_column]], vectorize=False
107
  )
108
+ df = vdf.to_pandas_df()
109
 
110
+ return df
111
 
112
  def __call__(self, series: Series) -> Series:
113
  if self.pre:
src/utils.py CHANGED
@@ -1,14 +1,15 @@
1
  import base64
2
-
3
- import altair as alt
4
- import pandas as pd
5
  import streamlit as st
6
  from PIL import Image
7
 
 
 
8
  from .configs import SupportedFiles, ColumnNames
9
 
10
 
11
- def get_col_indices(cols):
12
  """Ugly but works"""
13
  cols = [i.lower() for i in cols]
14
  try:
@@ -25,12 +26,12 @@ def get_col_indices(cols):
25
 
26
 
27
  @st.cache
28
- def get_logo(path):
29
  return Image.open(path)
30
 
31
 
32
  @st.experimental_memo
33
- def read_file(uploaded_file) -> pd.DataFrame:
34
  file_type = uploaded_file.name.split(".")[-1]
35
  read_fn = SupportedFiles[file_type].value[0]
36
  df = read_fn(uploaded_file)
@@ -39,12 +40,12 @@ def read_file(uploaded_file) -> pd.DataFrame:
39
 
40
 
41
  @st.cache
42
- def convert_df(df):
43
  # IMPORTANT: Cache the conversion to prevent computation on every rerun
44
  return df.to_csv(index=False, sep=";").encode("utf-8")
45
 
46
 
47
- def download_button(dataframe: pd.DataFrame, name: str):
48
  csv = dataframe.to_csv(index=False)
49
  # some strings <-> bytes conversions necessary here
50
  b64 = base64.b64encode(csv.encode()).decode()
@@ -52,79 +53,79 @@ def download_button(dataframe: pd.DataFrame, name: str):
52
  st.write(href, unsafe_allow_html=True)
53
 
54
 
55
- def plot_labels_prop(data: pd.DataFrame, label_column: str):
56
 
57
- unique_value_limit = 100
58
 
59
- if data[label_column].nunique() > unique_value_limit:
60
 
61
- st.warning(
62
- f"""
63
- The column you selected has more than {unique_value_limit}.
64
- Are you sure it's the right column? If it is, please note that
65
- this will impact __Wordify__ performance.
66
- """
67
- )
68
 
69
- return
70
 
71
- source = (
72
- data[label_column]
73
- .value_counts()
74
- .reset_index()
75
- .rename(columns={"index": "Labels", label_column: "Counts"})
76
- )
77
- source["Props"] = source["Counts"] / source["Counts"].sum()
78
- source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
79
 
80
- bars = (
81
- alt.Chart(source)
82
- .mark_bar()
83
- .encode(
84
- x=alt.X("Labels:O", sort="-y"),
85
- y="Counts:Q",
86
- )
87
- )
88
 
89
- text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
90
- text="Proportions:O"
91
- )
92
 
93
- return (bars + text).properties(height=300)
94
 
95
 
96
- def plot_nchars(data: pd.DataFrame, text_column: str):
97
- source = data[text_column].str.len().to_frame()
98
 
99
- plot = (
100
- alt.Chart(source)
101
- .mark_bar()
102
- .encode(
103
- alt.X(
104
- f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
105
- ),
106
- alt.Y("count()", axis=alt.Axis(title="")),
107
- )
108
- )
109
 
110
- return plot.properties(height=300)
111
 
112
 
113
- def plot_score(data: pd.DataFrame, label_col: str, label: str):
114
 
115
- source = (
116
- data.loc[data[label_col] == label]
117
- .sort_values("score", ascending=False)
118
- .head(100)
119
- )
120
 
121
- plot = (
122
- alt.Chart(source)
123
- .mark_bar()
124
- .encode(
125
- y=alt.Y("word:O", sort="-x"),
126
- x="score:Q",
127
- )
128
- )
129
 
130
- return plot.properties(height=max(30 * source.shape[0], 50))
 
1
  import base64
2
+ from typing import List, Tuple
3
+ from pandas.core.frame import DataFrame
 
4
  import streamlit as st
5
  from PIL import Image
6
 
7
+ # import altair as alt
8
+
9
  from .configs import SupportedFiles, ColumnNames
10
 
11
 
12
+ def get_col_indices(cols: List) -> Tuple[int, int]:
13
  """Ugly but works"""
14
  cols = [i.lower() for i in cols]
15
  try:
 
26
 
27
 
28
  @st.cache
29
+ def get_logo(path: str) -> Image:
30
  return Image.open(path)
31
 
32
 
33
  @st.experimental_memo
34
+ def read_file(uploaded_file) -> DataFrame:
35
  file_type = uploaded_file.name.split(".")[-1]
36
  read_fn = SupportedFiles[file_type].value[0]
37
  df = read_fn(uploaded_file)
 
40
 
41
 
42
  @st.cache
43
+ def convert_df(df: DataFrame) -> bytes:
44
  # IMPORTANT: Cache the conversion to prevent computation on every rerun
45
  return df.to_csv(index=False, sep=";").encode("utf-8")
46
 
47
 
48
+ def download_button(dataframe: DataFrame, name: str) -> None:
49
  csv = dataframe.to_csv(index=False)
50
  # some strings <-> bytes conversions necessary here
51
  b64 = base64.b64encode(csv.encode()).decode()
 
53
  st.write(href, unsafe_allow_html=True)
54
 
55
 
56
+ # def plot_labels_prop(data: DataFrame, label_column: str):
57
 
58
+ # unique_value_limit = 100
59
 
60
+ # if data[label_column].nunique() > unique_value_limit:
61
 
62
+ # st.warning(
63
+ # f"""
64
+ # The column you selected has more than {unique_value_limit}.
65
+ # Are you sure it's the right column? If it is, please note that
66
+ # this will impact __Wordify__ performance.
67
+ # """
68
+ # )
69
 
70
+ # return
71
 
72
+ # source = (
73
+ # data[label_column]
74
+ # .value_counts()
75
+ # .reset_index()
76
+ # .rename(columns={"index": "Labels", label_column: "Counts"})
77
+ # )
78
+ # source["Props"] = source["Counts"] / source["Counts"].sum()
79
+ # source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
80
 
81
+ # bars = (
82
+ # alt.Chart(source)
83
+ # .mark_bar()
84
+ # .encode(
85
+ # x=alt.X("Labels:O", sort="-y"),
86
+ # y="Counts:Q",
87
+ # )
88
+ # )
89
 
90
+ # text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
91
+ # text="Proportions:O"
92
+ # )
93
 
94
+ # return (bars + text).properties(height=300)
95
 
96
 
97
+ # def plot_nchars(data: DataFrame, text_column: str):
98
+ # source = data[text_column].str.len().to_frame()
99
 
100
+ # plot = (
101
+ # alt.Chart(source)
102
+ # .mark_bar()
103
+ # .encode(
104
+ # alt.X(
105
+ # f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
106
+ # ),
107
+ # alt.Y("count()", axis=alt.Axis(title="")),
108
+ # )
109
+ # )
110
 
111
+ # return plot.properties(height=300)
112
 
113
 
114
+ # def plot_score(data: DataFrame, label_col: str, label: str):
115
 
116
+ # source = (
117
+ # data.loc[data[label_col] == label]
118
+ # .sort_values("score", ascending=False)
119
+ # .head(100)
120
+ # )
121
 
122
+ # plot = (
123
+ # alt.Chart(source)
124
+ # .mark_bar()
125
+ # .encode(
126
+ # y=alt.Y("word:O", sort="-x"),
127
+ # x="score:Q",
128
+ # )
129
+ # )
130
 
131
+ # return plot.properties(height=max(30 * source.shape[0], 50))