Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
fdbadfe
1
Parent(s):
ca663e1
add missing typing
Browse files
- src/preprocessing.py +11 -11
- src/utils.py +68 -67
src/preprocessing.py
CHANGED
@@ -19,22 +19,22 @@ from .configs import Languages
|
|
19 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
20 |
# fmt: off
|
21 |
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
|
22 |
-
def normalize_acronyms(t):
|
23 |
return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
|
24 |
|
25 |
|
26 |
_re_non_word = re.compile(r"\W")
|
27 |
-
def remove_non_word(t):
|
28 |
return _re_non_word.sub(" ", t)
|
29 |
|
30 |
|
31 |
_re_space = re.compile(r" {2,}")
|
32 |
-
def normalize_useless_spaces(t):
|
33 |
return _re_space.sub(" ", t)
|
34 |
|
35 |
|
36 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
37 |
-
def normalize_repeating_chars(t):
|
38 |
def _replace_rep(m):
|
39 |
c, cc = m.groups()
|
40 |
return c
|
@@ -43,7 +43,7 @@ def normalize_repeating_chars(t):
|
|
43 |
|
44 |
|
45 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
46 |
-
def normalize_repeating_words(t):
|
47 |
def _replace_wrep(m):
|
48 |
c, cc, e = m.groups()
|
49 |
return c
|
@@ -92,11 +92,10 @@ class PreprocessingPipeline:
|
|
92 |
self.post = self.make_pre_post_component(self.post_steps)
|
93 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
94 |
|
95 |
-
def apply_multiproc(fn, series):
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
return new_series
|
100 |
|
101 |
def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
|
102 |
def fn(t):
|
@@ -106,8 +105,9 @@ class PreprocessingPipeline:
|
|
106 |
vdf["processed_text"] = vdf.apply(
|
107 |
fn, arguments=[vdf[text_column]], vectorize=False
|
108 |
)
|
|
|
109 |
|
110 |
-
return
|
111 |
|
112 |
def __call__(self, series: Series) -> Series:
|
113 |
if self.pre:
|
|
|
19 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
20 |
# fmt: off
|
21 |
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
|
22 |
+
def normalize_acronyms(t: str) -> str:
|
23 |
return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
|
24 |
|
25 |
|
26 |
_re_non_word = re.compile(r"\W")
|
27 |
+
def remove_non_word(t: str) -> str:
|
28 |
return _re_non_word.sub(" ", t)
|
29 |
|
30 |
|
31 |
_re_space = re.compile(r" {2,}")
|
32 |
+
def normalize_useless_spaces(t: str) -> str:
|
33 |
return _re_space.sub(" ", t)
|
34 |
|
35 |
|
36 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
37 |
+
def normalize_repeating_chars(t: str) -> str:
|
38 |
def _replace_rep(m):
|
39 |
c, cc = m.groups()
|
40 |
return c
|
|
|
43 |
|
44 |
|
45 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
46 |
+
def normalize_repeating_words(t: str) -> str:
|
47 |
def _replace_wrep(m):
|
48 |
c, cc, e = m.groups()
|
49 |
return c
|
|
|
92 |
self.post = self.make_pre_post_component(self.post_steps)
|
93 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
94 |
|
95 |
+
# def apply_multiproc(fn, series):
|
96 |
+
# with mp.Pool(mp.cpu_count()) as pool:
|
97 |
+
# new_series = pool.map(fn, series)
|
98 |
+
# return new_series
|
|
|
99 |
|
100 |
def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
|
101 |
def fn(t):
|
|
|
105 |
vdf["processed_text"] = vdf.apply(
|
106 |
fn, arguments=[vdf[text_column]], vectorize=False
|
107 |
)
|
108 |
+
df = vdf.to_pandas_df()
|
109 |
|
110 |
+
return df
|
111 |
|
112 |
def __call__(self, series: Series) -> Series:
|
113 |
if self.pre:
|
src/utils.py
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
import base64
|
2 |
-
|
3 |
-
|
4 |
-
import pandas as pd
|
5 |
import streamlit as st
|
6 |
from PIL import Image
|
7 |
|
|
|
|
|
8 |
from .configs import SupportedFiles, ColumnNames
|
9 |
|
10 |
|
11 |
-
def get_col_indices(cols):
|
12 |
"""Ugly but works"""
|
13 |
cols = [i.lower() for i in cols]
|
14 |
try:
|
@@ -25,12 +26,12 @@ def get_col_indices(cols):
|
|
25 |
|
26 |
|
27 |
@st.cache
|
28 |
-
def get_logo(path):
|
29 |
return Image.open(path)
|
30 |
|
31 |
|
32 |
@st.experimental_memo
|
33 |
-
def read_file(uploaded_file) ->
|
34 |
file_type = uploaded_file.name.split(".")[-1]
|
35 |
read_fn = SupportedFiles[file_type].value[0]
|
36 |
df = read_fn(uploaded_file)
|
@@ -39,12 +40,12 @@ def read_file(uploaded_file) -> pd.DataFrame:
|
|
39 |
|
40 |
|
41 |
@st.cache
|
42 |
-
def convert_df(df):
|
43 |
# IMPORTANT: Cache the conversion to prevent computation on every rerun
|
44 |
return df.to_csv(index=False, sep=";").encode("utf-8")
|
45 |
|
46 |
|
47 |
-
def download_button(dataframe:
|
48 |
csv = dataframe.to_csv(index=False)
|
49 |
# some strings <-> bytes conversions necessary here
|
50 |
b64 = base64.b64encode(csv.encode()).decode()
|
@@ -52,79 +53,79 @@ def download_button(dataframe: pd.DataFrame, name: str):
|
|
52 |
st.write(href, unsafe_allow_html=True)
|
53 |
|
54 |
|
55 |
-
def plot_labels_prop(data:
|
56 |
|
57 |
-
|
58 |
|
59 |
-
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
|
69 |
-
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
|
93 |
-
|
94 |
|
95 |
|
96 |
-
def plot_nchars(data:
|
97 |
-
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
|
110 |
-
|
111 |
|
112 |
|
113 |
-
def plot_score(data:
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
|
130 |
-
|
|
|
1 |
import base64
|
2 |
+
from typing import List, Tuple
|
3 |
+
from pandas.core.frame import DataFrame
|
|
|
4 |
import streamlit as st
|
5 |
from PIL import Image
|
6 |
|
7 |
+
# import altair as alt
|
8 |
+
|
9 |
from .configs import SupportedFiles, ColumnNames
|
10 |
|
11 |
|
12 |
+
def get_col_indices(cols: List) -> Tuple[int, int]:
|
13 |
"""Ugly but works"""
|
14 |
cols = [i.lower() for i in cols]
|
15 |
try:
|
|
|
26 |
|
27 |
|
28 |
@st.cache
|
29 |
+
def get_logo(path: str) -> Image:
|
30 |
return Image.open(path)
|
31 |
|
32 |
|
33 |
@st.experimental_memo
|
34 |
+
def read_file(uploaded_file) -> DataFrame:
|
35 |
file_type = uploaded_file.name.split(".")[-1]
|
36 |
read_fn = SupportedFiles[file_type].value[0]
|
37 |
df = read_fn(uploaded_file)
|
|
|
40 |
|
41 |
|
42 |
@st.cache
|
43 |
+
def convert_df(df: DataFrame) -> bytes:
|
44 |
# IMPORTANT: Cache the conversion to prevent computation on every rerun
|
45 |
return df.to_csv(index=False, sep=";").encode("utf-8")
|
46 |
|
47 |
|
48 |
+
def download_button(dataframe: DataFrame, name: str) -> None:
|
49 |
csv = dataframe.to_csv(index=False)
|
50 |
# some strings <-> bytes conversions necessary here
|
51 |
b64 = base64.b64encode(csv.encode()).decode()
|
|
|
53 |
st.write(href, unsafe_allow_html=True)
|
54 |
|
55 |
|
56 |
+
# def plot_labels_prop(data: DataFrame, label_column: str):
|
57 |
|
58 |
+
# unique_value_limit = 100
|
59 |
|
60 |
+
# if data[label_column].nunique() > unique_value_limit:
|
61 |
|
62 |
+
# st.warning(
|
63 |
+
# f"""
|
64 |
+
# The column you selected has more than {unique_value_limit}.
|
65 |
+
# Are you sure it's the right column? If it is, please note that
|
66 |
+
# this will impact __Wordify__ performance.
|
67 |
+
# """
|
68 |
+
# )
|
69 |
|
70 |
+
# return
|
71 |
|
72 |
+
# source = (
|
73 |
+
# data[label_column]
|
74 |
+
# .value_counts()
|
75 |
+
# .reset_index()
|
76 |
+
# .rename(columns={"index": "Labels", label_column: "Counts"})
|
77 |
+
# )
|
78 |
+
# source["Props"] = source["Counts"] / source["Counts"].sum()
|
79 |
+
# source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
80 |
|
81 |
+
# bars = (
|
82 |
+
# alt.Chart(source)
|
83 |
+
# .mark_bar()
|
84 |
+
# .encode(
|
85 |
+
# x=alt.X("Labels:O", sort="-y"),
|
86 |
+
# y="Counts:Q",
|
87 |
+
# )
|
88 |
+
# )
|
89 |
|
90 |
+
# text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
|
91 |
+
# text="Proportions:O"
|
92 |
+
# )
|
93 |
|
94 |
+
# return (bars + text).properties(height=300)
|
95 |
|
96 |
|
97 |
+
# def plot_nchars(data: DataFrame, text_column: str):
|
98 |
+
# source = data[text_column].str.len().to_frame()
|
99 |
|
100 |
+
# plot = (
|
101 |
+
# alt.Chart(source)
|
102 |
+
# .mark_bar()
|
103 |
+
# .encode(
|
104 |
+
# alt.X(
|
105 |
+
# f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
|
106 |
+
# ),
|
107 |
+
# alt.Y("count()", axis=alt.Axis(title="")),
|
108 |
+
# )
|
109 |
+
# )
|
110 |
|
111 |
+
# return plot.properties(height=300)
|
112 |
|
113 |
|
114 |
+
# def plot_score(data: DataFrame, label_col: str, label: str):
|
115 |
|
116 |
+
# source = (
|
117 |
+
# data.loc[data[label_col] == label]
|
118 |
+
# .sort_values("score", ascending=False)
|
119 |
+
# .head(100)
|
120 |
+
# )
|
121 |
|
122 |
+
# plot = (
|
123 |
+
# alt.Chart(source)
|
124 |
+
# .mark_bar()
|
125 |
+
# .encode(
|
126 |
+
# y=alt.Y("word:O", sort="-x"),
|
127 |
+
# x="score:Q",
|
128 |
+
# )
|
129 |
+
# )
|
130 |
|
131 |
+
# return plot.properties(height=max(30 * source.shape[0], 50))
|