Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
e48d543
1
Parent(s):
b748dad
format and delete old
Browse files
- app.py +6 -7
- main.py +6 -6
- src/components.py +28 -9
- src/configs.py +1 -0
- src/pages/about.py +0 -34
- src/pages/faq.py +0 -126
- src/pages/home.py +0 -240
- src/plotting.py +0 -84
- src/preprocessing.py +7 -3
- src/session_state.py +0 -121
- src/utils.py +18 -5
- src/wordifier.py +34 -10
app.py
CHANGED
@@ -1,12 +1,9 @@
|
|
1 |
import streamlit as st
|
2 |
-
|
3 |
from src import session_state
|
4 |
-
from src.pages import (
|
5 |
-
home,
|
6 |
-
faq,
|
7 |
-
about,
|
8 |
-
)
|
9 |
from src.configs import SupportedFiles
|
|
|
|
|
10 |
|
11 |
# app configs
|
12 |
st.set_page_config(
|
@@ -59,7 +56,9 @@ st.sidebar.markdown(
|
|
59 |
""",
|
60 |
unsafe_allow_html=True,
|
61 |
)
|
62 |
-
st.sidebar.info(
|
|
|
|
|
63 |
|
64 |
|
65 |
# ==== MAIN ==== #
|
|
|
1 |
import streamlit as st
|
2 |
+
|
3 |
from src import session_state
|
|
|
|
|
|
|
|
|
|
|
4 |
from src.configs import SupportedFiles
|
5 |
+
from src.pages import about, faq, home
|
6 |
+
from src.utils import get_logo
|
7 |
|
8 |
# app configs
|
9 |
st.set_page_config(
|
|
|
56 |
""",
|
57 |
unsafe_allow_html=True,
|
58 |
)
|
59 |
+
st.sidebar.info(
|
60 |
+
"Something not working? Consider [filing an issue](https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new)"
|
61 |
+
)
|
62 |
|
63 |
|
64 |
# ==== MAIN ==== #
|
main.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import streamlit as st
|
2 |
-
from src.utils import get_logo, read_file, convert_df
|
3 |
-
from src.components import form, faq, presentation, footer, about
|
4 |
|
|
|
|
|
5 |
|
6 |
# app configs
|
7 |
st.set_page_config(
|
@@ -10,10 +10,10 @@ st.set_page_config(
|
|
10 |
layout="centered",
|
11 |
page_icon="./assets/logo.png",
|
12 |
menu_items={
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
}
|
17 |
)
|
18 |
|
19 |
# logo
|
|
|
1 |
import streamlit as st
|
|
|
|
|
2 |
|
3 |
+
from src.components import about, faq, footer, form, presentation
|
4 |
+
from src.utils import convert_df, get_logo, read_file
|
5 |
|
6 |
# app configs
|
7 |
st.set_page_config(
|
|
|
10 |
layout="centered",
|
11 |
page_icon="./assets/logo.png",
|
12 |
menu_items={
|
13 |
+
"Get Help": "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
|
14 |
+
"Report a Bug": "https://github.com/MilaNLProc/wordify-webapp-streamlit/issues/new",
|
15 |
+
"About": about(),
|
16 |
+
},
|
17 |
)
|
18 |
|
19 |
# logo
|
src/components.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
2 |
from src.preprocessing import PreprocessingPipeline
|
3 |
-
from src.wordifier import input_transform,
|
4 |
-
from src.configs import PreprocessingConfigs, SupportedFiles, Languages
|
5 |
|
6 |
|
7 |
@st.experimental_memo
|
@@ -12,10 +13,16 @@ def form(df):
|
|
12 |
|
13 |
cols = [""] + df.columns.tolist()
|
14 |
label_column = st.selectbox(
|
15 |
-
"Select label column",
|
|
|
|
|
|
|
16 |
)
|
17 |
text_column = st.selectbox(
|
18 |
-
"Select text column",
|
|
|
|
|
|
|
19 |
)
|
20 |
language = st.selectbox(
|
21 |
"Select language",
|
@@ -31,12 +38,16 @@ def form(df):
|
|
31 |
pre_steps = st.multiselect(
|
32 |
"Select pre-lemmatization processing steps (ordered)",
|
33 |
options=steps_options,
|
34 |
-
default=[
|
|
|
|
|
35 |
format_func=lambda x: x.replace("_", " ").title(),
|
36 |
help="Select the processing steps to apply before the text is lemmatized",
|
37 |
)
|
38 |
|
39 |
-
lammatization_options = list(
|
|
|
|
|
40 |
lemmatization_step = st.selectbox(
|
41 |
"Select lemmatization",
|
42 |
options=lammatization_options,
|
@@ -47,7 +58,9 @@ def form(df):
|
|
47 |
post_steps = st.multiselect(
|
48 |
"Select post-lemmatization processing steps (ordered)",
|
49 |
options=steps_options,
|
50 |
-
default=[
|
|
|
|
|
51 |
format_func=lambda x: x.replace("_", " ").title(),
|
52 |
help="Select the processing steps to apply after the text is lemmatized",
|
53 |
)
|
@@ -58,7 +71,9 @@ def form(df):
|
|
58 |
|
59 |
# preprocess
|
60 |
with st.spinner("Step 1/4: Preprocessing text"):
|
61 |
-
pipe = PreprocessingPipeline(
|
|
|
|
|
62 |
df = pipe.vaex_process(df, text_column)
|
63 |
|
64 |
# prepare input
|
@@ -188,7 +203,10 @@ def presentation():
|
|
188 |
"""
|
189 |
)
|
190 |
st.table(
|
191 |
-
{
|
|
|
|
|
|
|
192 |
)
|
193 |
|
194 |
st.subheader("Output format")
|
@@ -226,6 +244,7 @@ def contacts():
|
|
226 |
<iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
|
227 |
"""
|
228 |
|
|
|
229 |
def about():
|
230 |
return """
|
231 |
The wordify team
|
|
|
1 |
import streamlit as st
|
2 |
+
|
3 |
+
from src.configs import Languages, PreprocessingConfigs, SupportedFiles
|
4 |
from src.preprocessing import PreprocessingPipeline
|
5 |
+
from src.wordifier import input_transform, output_transform, wordifier
|
|
|
6 |
|
7 |
|
8 |
@st.experimental_memo
|
|
|
13 |
|
14 |
cols = [""] + df.columns.tolist()
|
15 |
label_column = st.selectbox(
|
16 |
+
"Select label column",
|
17 |
+
cols,
|
18 |
+
index=0,
|
19 |
+
help="Select the column containing the labels",
|
20 |
)
|
21 |
text_column = st.selectbox(
|
22 |
+
"Select text column",
|
23 |
+
cols,
|
24 |
+
index=0,
|
25 |
+
help="Select the column containing the text",
|
26 |
)
|
27 |
language = st.selectbox(
|
28 |
"Select language",
|
|
|
38 |
pre_steps = st.multiselect(
|
39 |
"Select pre-lemmatization processing steps (ordered)",
|
40 |
options=steps_options,
|
41 |
+
default=[
|
42 |
+
steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
|
43 |
+
],
|
44 |
format_func=lambda x: x.replace("_", " ").title(),
|
45 |
help="Select the processing steps to apply before the text is lemmatized",
|
46 |
)
|
47 |
|
48 |
+
lammatization_options = list(
|
49 |
+
PreprocessingPipeline.lemmatization_component().keys()
|
50 |
+
)
|
51 |
lemmatization_step = st.selectbox(
|
52 |
"Select lemmatization",
|
53 |
options=lammatization_options,
|
|
|
58 |
post_steps = st.multiselect(
|
59 |
"Select post-lemmatization processing steps (ordered)",
|
60 |
options=steps_options,
|
61 |
+
default=[
|
62 |
+
steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value
|
63 |
+
],
|
64 |
format_func=lambda x: x.replace("_", " ").title(),
|
65 |
help="Select the processing steps to apply after the text is lemmatized",
|
66 |
)
|
|
|
71 |
|
72 |
# preprocess
|
73 |
with st.spinner("Step 1/4: Preprocessing text"):
|
74 |
+
pipe = PreprocessingPipeline(
|
75 |
+
language, pre_steps, lemmatization_step, post_steps
|
76 |
+
)
|
77 |
df = pipe.vaex_process(df, text_column)
|
78 |
|
79 |
# prepare input
|
|
|
203 |
"""
|
204 |
)
|
205 |
st.table(
|
206 |
+
{
|
207 |
+
"text": ["A review", "Another review", "Yet another one", "etc"],
|
208 |
+
"label": ["Good", "Bad", "Good", "etc"],
|
209 |
+
}
|
210 |
)
|
211 |
|
212 |
st.subheader("Output format")
|
|
|
244 |
<iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
|
245 |
"""
|
246 |
|
247 |
+
|
248 |
def about():
|
249 |
return """
|
250 |
The wordify team
|
src/configs.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
from enum import Enum
|
|
|
2 |
import pandas as pd
|
3 |
|
4 |
|
|
|
1 |
from enum import Enum
|
2 |
+
|
3 |
import pandas as pd
|
4 |
|
5 |
|
src/pages/about.py
DELETED
@@ -1,34 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
|
3 |
-
|
4 |
-
def write(*args):
|
5 |
-
# ==== Contacts ==== #
|
6 |
-
with st.beta_container():
|
7 |
-
st.markdown("")
|
8 |
-
st.markdown("")
|
9 |
-
st.header(":rocket:About us")
|
10 |
-
|
11 |
-
st.markdown(
|
12 |
-
"""
|
13 |
-
You can reach out to us via email, phone, or - if you are old-fashioned - via mail
|
14 |
-
"""
|
15 |
-
)
|
16 |
-
with st.beta_expander("Contacts"):
|
17 |
-
|
18 |
-
_, col2 = st.beta_columns([0.5, 3])
|
19 |
-
col2.markdown(
|
20 |
-
"""
|
21 |
-
:email: [email protected]
|
22 |
-
|
23 |
-
:telephone_receiver: +39 02 5836 2604
|
24 |
-
|
25 |
-
:postbox: Via Röntgen n. 1, Milan 20136 (ITALY)
|
26 |
-
"""
|
27 |
-
)
|
28 |
-
|
29 |
-
st.write(
|
30 |
-
"""
|
31 |
-
<iframe src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d2798.949796165441!2d9.185730115812493!3d45.450667779100726!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4786c405ae6543c9%3A0xf2bb2313b36af88c!2sVia%20Guglielmo%20R%C3%B6ntgen%2C%201%2C%2020136%20Milano%20MI!5e0!3m2!1sit!2sit!4v1569325279433!5m2!1sit!2sit" frameborder="0" style="border:0; width: 100%; height: 312px;" allowfullscreen></iframe>
|
32 |
-
""",
|
33 |
-
unsafe_allow_html=True,
|
34 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/pages/faq.py
DELETED
@@ -1,126 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
from src.configs import Languages
|
3 |
-
|
4 |
-
|
5 |
-
def write(*args):
|
6 |
-
|
7 |
-
# ==== HOW IT WORKS ==== #
|
8 |
-
with st.beta_container():
|
9 |
-
st.markdown("")
|
10 |
-
st.markdown("")
|
11 |
-
st.markdown(
|
12 |
-
"""
|
13 |
-
Wordify makes it easy to identify words that discriminate categories in textual data.
|
14 |
-
|
15 |
-
Let's explain Wordify with an example. Imagine you are thinking about having a glass
|
16 |
-
of wine :wine_glass: with your friends :man-man-girl-girl: and you have to buy a bottle.
|
17 |
-
You know you like `bold`, `woody` wine but are unsure which one to choose.
|
18 |
-
You wonder whether there are some words that describe each type of wine.
|
19 |
-
Since you are a researcher :female-scientist: :male-scientist:, you decide to approach
|
20 |
-
the problem scientifically :microscope:. That's where Wordify comes to the rescue!
|
21 |
-
"""
|
22 |
-
)
|
23 |
-
st.markdown("")
|
24 |
-
st.markdown("")
|
25 |
-
st.header("Steps")
|
26 |
-
st.subheader("Step 1 - Prepare your data")
|
27 |
-
st.markdown(
|
28 |
-
"""
|
29 |
-
Create an Excel or CSV file with two columns for each row:
|
30 |
-
|
31 |
-
- a column with the name or the label identifying a specific object or class (e.g., in our
|
32 |
-
wine example above it would be the type of wine or the name of a specific brand). It is
|
33 |
-
common practice naming this column `label`
|
34 |
-
|
35 |
-
- a column with the text describing that specific object or class (e.g., in the wine example
|
36 |
-
above it could be the description that you find on the rear of the bottle label). It is
|
37 |
-
common practice naming this column `text`
|
38 |
-
|
39 |
-
To have reliable results, we suggest providing at least 2000 labelled texts. If you provide
|
40 |
-
less we will still wordify your file, but the results should then be taken with a grain of
|
41 |
-
salt.
|
42 |
-
|
43 |
-
Consider that we also support multi-language texts, therefore you'll be able to
|
44 |
-
automatically discriminate between international wines, even if your preferred Italian
|
45 |
-
producer does not provide you with a description written in English!
|
46 |
-
"""
|
47 |
-
)
|
48 |
-
|
49 |
-
st.subheader("Step 2 - Upload your file and Wordify!")
|
50 |
-
st.markdown(
|
51 |
-
"""
|
52 |
-
Once you have prepared your Excel or CSV file, click the "Browse File" button.
|
53 |
-
Browse for your file.
|
54 |
-
Choose the language of your texts (select multi-language if your file contains text in
|
55 |
-
different languages).
|
56 |
-
Push the "Wordify|" button, set back, and wait for wordify to do its tricks.
|
57 |
-
|
58 |
-
Depending on the size of your data, the process can take from 1 minute to 5 minutes
|
59 |
-
"""
|
60 |
-
)
|
61 |
-
|
62 |
-
# ==== FAQ ==== #
|
63 |
-
with st.beta_container():
|
64 |
-
st.markdown("")
|
65 |
-
st.markdown("")
|
66 |
-
st.header(":question:Frequently Asked Questions")
|
67 |
-
with st.beta_expander("What is Wordify?"):
|
68 |
-
st.markdown(
|
69 |
-
"""
|
70 |
-
Wordify is a way to find out which terms are most indicative for each of your dependent
|
71 |
-
variable values.
|
72 |
-
"""
|
73 |
-
)
|
74 |
-
|
75 |
-
with st.beta_expander("What happens to my data?"):
|
76 |
-
st.markdown(
|
77 |
-
"""
|
78 |
-
Nothing. We never store the data you upload on disk: it is only kept in memory for the
|
79 |
-
duration of the modeling, and then deleted. We do not retain any copies or traces of
|
80 |
-
your data.
|
81 |
-
"""
|
82 |
-
)
|
83 |
-
|
84 |
-
with st.beta_expander("What input formats do you support?"):
|
85 |
-
st.markdown(
|
86 |
-
"""
|
87 |
-
The file you upload should be .xlsx, with two columns: the first should be labeled
|
88 |
-
'text' and contain all your documents (e.g., tweets, reviews, patents, etc.), one per
|
89 |
-
line. The second column should be labeled 'label', and contain the dependent variable
|
90 |
-
label associated with each text (e.g., rating, author gender, company, etc.).
|
91 |
-
"""
|
92 |
-
)
|
93 |
-
|
94 |
-
with st.beta_expander("How does it work?"):
|
95 |
-
st.markdown(
|
96 |
-
"""
|
97 |
-
It uses a variant of the Stability Selection algorithm
|
98 |
-
[(Meinshausen and Bühlmann, 2010)](https://rss.onlinelibrary.wiley.com/doi/full/10.1111/j.1467-9868.2010.00740.x)
|
99 |
-
to fit hundreds of logistic regression models on random subsets of the data, using
|
100 |
-
different L1 penalties to drive as many of the term coefficients to 0. Any terms that
|
101 |
-
receive a non-zero coefficient in at least 30% of all model runs can be seen as stable
|
102 |
-
indicators.
|
103 |
-
"""
|
104 |
-
)
|
105 |
-
|
106 |
-
with st.beta_expander("How much data do I need?"):
|
107 |
-
st.markdown(
|
108 |
-
"""
|
109 |
-
We recommend at least 2000 instances, the more, the better. With fewer instances, the
|
110 |
-
results are less replicable and reliable.
|
111 |
-
"""
|
112 |
-
)
|
113 |
-
|
114 |
-
with st.beta_expander("Is there a paper I can cite?"):
|
115 |
-
st.markdown(
|
116 |
-
"""
|
117 |
-
Yes please! Reference coming soon...
|
118 |
-
"""
|
119 |
-
)
|
120 |
-
|
121 |
-
with st.beta_expander("What languages are supported?"):
|
122 |
-
st.markdown(
|
123 |
-
f"""
|
124 |
-
Currently we support: {", ".join([i.name for i in Languages])}.
|
125 |
-
"""
|
126 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/pages/home.py
DELETED
@@ -1,240 +0,0 @@
|
|
1 |
-
from src.configs import Languages
|
2 |
-
from src.utils import read_file, download_button
|
3 |
-
from src.plotting import plot_labels_prop, plot_nchars, plot_score
|
4 |
-
from src.preprocessing import Lemmatizer, PreprocessingPipeline, encode
|
5 |
-
from src.wordifier import wordifier
|
6 |
-
import streamlit as st
|
7 |
-
|
8 |
-
|
9 |
-
def write(session, uploaded_file):
|
10 |
-
|
11 |
-
if not uploaded_file:
|
12 |
-
st.markdown(
|
13 |
-
"""
|
14 |
-
Hi, welcome to __Wordify__! :rocket:
|
15 |
-
|
16 |
-
Start by uploading a file - CSV, XLSX (avoid Strict Open XML Spreadsheet format [here](https://stackoverflow.com/questions/62800822/openpyxl-cannot-read-strict-open-xml-spreadsheet-format-userwarning-file-conta)),
|
17 |
-
or PARQUET are currently supported.
|
18 |
-
|
19 |
-
Once you have uploaded the file, __Wordify__ will show an interactive UI through which
|
20 |
-
you'll be able to interactively decide the text preprocessing steps, their order, and
|
21 |
-
proceed to Wordify your text.
|
22 |
-
|
23 |
-
If you're ready, let's jump in:
|
24 |
-
|
25 |
-
:point_left: upload a file via the upload widget in the sidebar!
|
26 |
-
|
27 |
-
NOTE: whenever you want to reset everything, simply refresh the page.
|
28 |
-
"""
|
29 |
-
)
|
30 |
-
|
31 |
-
elif uploaded_file:
|
32 |
-
|
33 |
-
# ==== 1. READ FILE ==== #
|
34 |
-
with st.spinner("Reading file"):
|
35 |
-
# TODO: write parser function that automatically understands format
|
36 |
-
data = read_file(uploaded_file)
|
37 |
-
|
38 |
-
# 2. CREATE UI TO SELECT COLUMNS
|
39 |
-
col1, col2, col3 = st.beta_columns(3)
|
40 |
-
with col1:
|
41 |
-
language = st.selectbox("Select language", [i.name for i in Languages])
|
42 |
-
with st.beta_expander("Description"):
|
43 |
-
st.markdown(
|
44 |
-
f"Select a language amongst those supported: {', '.join([f'`{i.name}`' for i in Languages])}. This will be used to lemmatize and remove stopwords."
|
45 |
-
)
|
46 |
-
with col2:
|
47 |
-
cols_options = [""] + data.columns.tolist()
|
48 |
-
label_column = st.selectbox(
|
49 |
-
"Select label column name", cols_options, index=0
|
50 |
-
)
|
51 |
-
with st.beta_expander("Description"):
|
52 |
-
st.markdown("Select the column containing the labels.")
|
53 |
-
|
54 |
-
if label_column:
|
55 |
-
plot = plot_labels_prop(data, label_column)
|
56 |
-
if plot:
|
57 |
-
st.altair_chart(plot, use_container_width=True)
|
58 |
-
|
59 |
-
with col3:
|
60 |
-
text_column = st.selectbox("Select text column name", cols_options, index=0)
|
61 |
-
with st.beta_expander("Description"):
|
62 |
-
st.markdown("Select the column containing the texts.")
|
63 |
-
|
64 |
-
if text_column:
|
65 |
-
st.altair_chart(
|
66 |
-
plot_nchars(data, text_column), use_container_width=True
|
67 |
-
)
|
68 |
-
|
69 |
-
# ==== 2.1 CREATE UI FOR ADVANCED OPTIONS ==== #
|
70 |
-
with st.beta_expander("Advanced options"):
|
71 |
-
|
72 |
-
steps_options = list(PreprocessingPipeline.pipeline_components().keys())
|
73 |
-
|
74 |
-
# stopwords option and
|
75 |
-
col1, col2 = st.beta_columns([1, 3])
|
76 |
-
with col1:
|
77 |
-
st.markdown("Remove stopwords (uses Spacy vocabulary)")
|
78 |
-
with col2:
|
79 |
-
remove_stopwords_elem = st.empty()
|
80 |
-
|
81 |
-
# lemmatization option
|
82 |
-
col1, col2 = st.beta_columns([1, 3])
|
83 |
-
with col1:
|
84 |
-
st.markdown("Lemmatizes text (uses Spacy)")
|
85 |
-
with col2:
|
86 |
-
lemmatization_elem = st.empty()
|
87 |
-
|
88 |
-
# pre-lemmatization cleaning steps and
|
89 |
-
# post-lemmatization cleaning steps
|
90 |
-
col1, col2 = st.beta_columns([1, 3])
|
91 |
-
with col1:
|
92 |
-
st.markdown(
|
93 |
-
f"""
|
94 |
-
Define a pipeline of cleaning steps that is applied before and/or after lemmatization.
|
95 |
-
The available cleaning steps are:\n
|
96 |
-
{", ".join([f"`{x.replace('_', ' ').title()}`" for x in steps_options])}
|
97 |
-
"""
|
98 |
-
)
|
99 |
-
with col2:
|
100 |
-
pre_steps_elem = st.empty()
|
101 |
-
post_steps_elem = st.empty()
|
102 |
-
reset_button = st.empty()
|
103 |
-
|
104 |
-
# implement reset logic
|
105 |
-
if reset_button.button("Reset steps"):
|
106 |
-
session.run_id += 1
|
107 |
-
|
108 |
-
pre_steps = pre_steps_elem.multiselect(
|
109 |
-
"Select pre-lemmatization preprocessing steps (ordered)",
|
110 |
-
options=steps_options,
|
111 |
-
default=steps_options,
|
112 |
-
format_func=lambda x: x.replace("_", " ").title(),
|
113 |
-
key=session.run_id,
|
114 |
-
)
|
115 |
-
post_steps = post_steps_elem.multiselect(
|
116 |
-
"Select post-lemmatization processing steps (ordered)",
|
117 |
-
options=steps_options,
|
118 |
-
default=steps_options[-4:],
|
119 |
-
format_func=lambda x: x.replace("_", " ").title(),
|
120 |
-
key=session.run_id,
|
121 |
-
)
|
122 |
-
remove_stopwords = remove_stopwords_elem.checkbox(
|
123 |
-
"Remove stopwords",
|
124 |
-
value=True,
|
125 |
-
key=session.run_id,
|
126 |
-
)
|
127 |
-
lemmatization = lemmatization_elem.checkbox(
|
128 |
-
"Lemmatize text",
|
129 |
-
value=True,
|
130 |
-
key=session.run_id,
|
131 |
-
)
|
132 |
-
|
133 |
-
# show sample checkbox
|
134 |
-
col1, col2 = st.beta_columns([1, 2])
|
135 |
-
with col1:
|
136 |
-
show_sample = st.checkbox("Show sample of preprocessed text")
|
137 |
-
|
138 |
-
# initialize text preprocessor
|
139 |
-
preprocessing_pipeline = PreprocessingPipeline(
|
140 |
-
pre_steps=pre_steps,
|
141 |
-
lemmatizer=Lemmatizer(
|
142 |
-
language=language,
|
143 |
-
remove_stop=remove_stopwords,
|
144 |
-
lemmatization=lemmatization,
|
145 |
-
),
|
146 |
-
post_steps=post_steps,
|
147 |
-
)
|
148 |
-
|
149 |
-
print(preprocessing_pipeline.pre_steps)
|
150 |
-
|
151 |
-
# ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
|
152 |
-
if show_sample and not (label_column and text_column):
|
153 |
-
st.warning("Please select `label` and `text` columns")
|
154 |
-
|
155 |
-
elif show_sample and (label_column and text_column):
|
156 |
-
sample_data = data.sample(5)
|
157 |
-
sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
|
158 |
-
sample_data[text_column]
|
159 |
-
).values
|
160 |
-
|
161 |
-
print(sample_data)
|
162 |
-
st.table(
|
163 |
-
sample_data.loc[
|
164 |
-
:, [label_column, text_column, f"preprocessed_{text_column}"]
|
165 |
-
]
|
166 |
-
)
|
167 |
-
|
168 |
-
# ==== 4. RUN ==== #
|
169 |
-
run_button = st.button("Wordify!")
|
170 |
-
if run_button and not (label_column and text_column):
|
171 |
-
st.warning("Please select `label` and `text` columns")
|
172 |
-
|
173 |
-
elif run_button and (label_column and text_column) and not session.process:
|
174 |
-
|
175 |
-
with st.spinner("Process started"):
|
176 |
-
# data = data.head()
|
177 |
-
data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
|
178 |
-
data[text_column]
|
179 |
-
).values
|
180 |
-
|
181 |
-
print(data.head())
|
182 |
-
|
183 |
-
inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
|
184 |
-
session.posdf, session.negdf = wordifier(**inputs)
|
185 |
-
st.success("Wordified!")
|
186 |
-
|
187 |
-
# session.posdf, session.negdf = process(data, text_column, label_column)
|
188 |
-
session.process = True
|
189 |
-
|
190 |
-
# ==== 5. RESULTS ==== #
|
191 |
-
if session.process and (label_column and text_column):
|
192 |
-
st.markdown("")
|
193 |
-
st.markdown("")
|
194 |
-
st.header("Results")
|
195 |
-
|
196 |
-
# col1, col2, _ = st.beta_columns(3)
|
197 |
-
col1, col2, col3 = st.beta_columns([2, 3, 3])
|
198 |
-
|
199 |
-
with col1:
|
200 |
-
label = st.selectbox(
|
201 |
-
"Select label", data[label_column].unique().tolist()
|
202 |
-
)
|
203 |
-
# # with col2:
|
204 |
-
# thres = st.slider(
|
205 |
-
# "Select threshold",
|
206 |
-
# min_value=0,
|
207 |
-
# max_value=100,
|
208 |
-
# step=1,
|
209 |
-
# format="%f",
|
210 |
-
# value=30,
|
211 |
-
# )
|
212 |
-
show_plots = st.checkbox("Show plots of top 100")
|
213 |
-
|
214 |
-
with col2:
|
215 |
-
st.subheader(f"Words __positively__ identifying label `{label}`")
|
216 |
-
st.write(
|
217 |
-
session.posdf[session.posdf[label_column] == label].sort_values(
|
218 |
-
"score", ascending=False
|
219 |
-
)
|
220 |
-
)
|
221 |
-
download_button(session.posdf, "positive_data")
|
222 |
-
if show_plots:
|
223 |
-
st.altair_chart(
|
224 |
-
plot_score(session.posdf, label_column, label),
|
225 |
-
use_container_width=True,
|
226 |
-
)
|
227 |
-
|
228 |
-
with col3:
|
229 |
-
st.subheader(f"Words __negatively__ identifying label `{label}`")
|
230 |
-
st.write(
|
231 |
-
session.negdf[session.negdf[label_column] == label].sort_values(
|
232 |
-
"score", ascending=False
|
233 |
-
)
|
234 |
-
)
|
235 |
-
download_button(session.negdf, "negative_data")
|
236 |
-
if show_plots:
|
237 |
-
st.altair_chart(
|
238 |
-
plot_score(session.negdf, label_column, label),
|
239 |
-
use_container_width=True,
|
240 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/plotting.py
DELETED
@@ -1,84 +0,0 @@
|
|
1 |
-
import altair as alt
|
2 |
-
import pandas as pd
|
3 |
-
import streamlit as st
|
4 |
-
from stqdm import stqdm
|
5 |
-
|
6 |
-
stqdm.pandas()
|
7 |
-
|
8 |
-
|
9 |
-
def plot_labels_prop(data: pd.DataFrame, label_column: str):
|
10 |
-
|
11 |
-
unique_value_limit = 100
|
12 |
-
|
13 |
-
if data[label_column].nunique() > unique_value_limit:
|
14 |
-
|
15 |
-
st.warning(
|
16 |
-
f"""
|
17 |
-
The column you selected has more than {unique_value_limit}.
|
18 |
-
Are you sure it's the right column? If it is, please note that
|
19 |
-
this will impact __Wordify__ performance.
|
20 |
-
"""
|
21 |
-
)
|
22 |
-
|
23 |
-
return
|
24 |
-
|
25 |
-
source = (
|
26 |
-
data[label_column]
|
27 |
-
.value_counts()
|
28 |
-
.reset_index()
|
29 |
-
.rename(columns={"index": "Labels", label_column: "Counts"})
|
30 |
-
)
|
31 |
-
source["Props"] = source["Counts"] / source["Counts"].sum()
|
32 |
-
source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
33 |
-
|
34 |
-
bars = (
|
35 |
-
alt.Chart(source)
|
36 |
-
.mark_bar()
|
37 |
-
.encode(
|
38 |
-
x=alt.X("Labels:O", sort="-y"),
|
39 |
-
y="Counts:Q",
|
40 |
-
)
|
41 |
-
)
|
42 |
-
|
43 |
-
text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
|
44 |
-
text="Proportions:O"
|
45 |
-
)
|
46 |
-
|
47 |
-
return (bars + text).properties(height=300)
|
48 |
-
|
49 |
-
|
50 |
-
def plot_nchars(data: pd.DataFrame, text_column: str):
|
51 |
-
source = data[text_column].str.len().to_frame()
|
52 |
-
|
53 |
-
plot = (
|
54 |
-
alt.Chart(source)
|
55 |
-
.mark_bar()
|
56 |
-
.encode(
|
57 |
-
alt.X(
|
58 |
-
f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
|
59 |
-
),
|
60 |
-
alt.Y("count()", axis=alt.Axis(title="")),
|
61 |
-
)
|
62 |
-
)
|
63 |
-
|
64 |
-
return plot.properties(height=300)
|
65 |
-
|
66 |
-
|
67 |
-
def plot_score(data: pd.DataFrame, label_col: str, label: str):
|
68 |
-
|
69 |
-
source = (
|
70 |
-
data.loc[data[label_col] == label]
|
71 |
-
.sort_values("score", ascending=False)
|
72 |
-
.head(100)
|
73 |
-
)
|
74 |
-
|
75 |
-
plot = (
|
76 |
-
alt.Chart(source)
|
77 |
-
.mark_bar()
|
78 |
-
.encode(
|
79 |
-
y=alt.Y("word:O", sort="-x"),
|
80 |
-
x="score:Q",
|
81 |
-
)
|
82 |
-
)
|
83 |
-
|
84 |
-
return plot.properties(height=max(30 * source.shape[0], 50))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/preprocessing.py
CHANGED
@@ -6,10 +6,10 @@ from collections import OrderedDict
|
|
6 |
from typing import Callable, List, Optional
|
7 |
|
8 |
import pandas as pd
|
9 |
-
from pandas.core.frame import DataFrame
|
10 |
import spacy
|
11 |
import streamlit as st
|
12 |
import vaex
|
|
|
13 |
from pandas.core.series import Series
|
14 |
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
15 |
|
@@ -103,7 +103,9 @@ class PreprocessingPipeline:
|
|
103 |
return self.post(self.lemma(self.nlp(self.pre(t))))
|
104 |
|
105 |
vdf = vaex.from_pandas(df)
|
106 |
-
vdf["processed_text"] = vdf.apply(
|
|
|
|
|
107 |
|
108 |
return vdf.to_pandas_df()
|
109 |
|
@@ -115,7 +117,9 @@ class PreprocessingPipeline:
|
|
115 |
total_steps = len(series) // 100
|
116 |
res = []
|
117 |
pbar = st.progress(0)
|
118 |
-
for i, doc in enumerate(
|
|
|
|
|
119 |
res.append(self.lemma(doc))
|
120 |
|
121 |
if i % total_steps == 0:
|
|
|
6 |
from typing import Callable, List, Optional
|
7 |
|
8 |
import pandas as pd
|
|
|
9 |
import spacy
|
10 |
import streamlit as st
|
11 |
import vaex
|
12 |
+
from pandas.core.frame import DataFrame
|
13 |
from pandas.core.series import Series
|
14 |
from textacy.preprocessing import make_pipeline, normalize, remove, replace
|
15 |
|
|
|
103 |
return self.post(self.lemma(self.nlp(self.pre(t))))
|
104 |
|
105 |
vdf = vaex.from_pandas(df)
|
106 |
+
vdf["processed_text"] = vdf.apply(
|
107 |
+
fn, arguments=[vdf[text_column]], vectorize=False
|
108 |
+
)
|
109 |
|
110 |
return vdf.to_pandas_df()
|
111 |
|
|
|
117 |
total_steps = len(series) // 100
|
118 |
res = []
|
119 |
pbar = st.progress(0)
|
120 |
+
for i, doc in enumerate(
|
121 |
+
self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
|
122 |
+
):
|
123 |
res.append(self.lemma(doc))
|
124 |
|
125 |
if i % total_steps == 0:
|
src/session_state.py
DELETED
@@ -1,121 +0,0 @@
|
|
1 |
-
"""Hack to add per-session state to Streamlit.
|
2 |
-
|
3 |
-
Usage
|
4 |
-
-----
|
5 |
-
|
6 |
-
>>> import SessionState
|
7 |
-
>>>
|
8 |
-
>>> session_state = SessionState.get(user_name='', favorite_color='black')
|
9 |
-
>>> session_state.user_name
|
10 |
-
''
|
11 |
-
>>> session_state.user_name = 'Mary'
|
12 |
-
>>> session_state.favorite_color
|
13 |
-
'black'
|
14 |
-
|
15 |
-
Since you set user_name above, next time your script runs this will be the
|
16 |
-
result:
|
17 |
-
>>> session_state = get(user_name='', favorite_color='black')
|
18 |
-
>>> session_state.user_name
|
19 |
-
'Mary'
|
20 |
-
|
21 |
-
"""
|
22 |
-
try:
|
23 |
-
import streamlit.ReportThread as ReportThread
|
24 |
-
from streamlit.server.Server import Server
|
25 |
-
except Exception:
|
26 |
-
# Streamlit >= 0.65.0
|
27 |
-
import streamlit.report_thread as ReportThread
|
28 |
-
from streamlit.server.server import Server
|
29 |
-
|
30 |
-
|
31 |
-
class SessionState(object):
|
32 |
-
def __init__(self, **kwargs):
|
33 |
-
"""A new SessionState object.
|
34 |
-
|
35 |
-
Parameters
|
36 |
-
----------
|
37 |
-
**kwargs : any
|
38 |
-
Default values for the session state.
|
39 |
-
|
40 |
-
Example
|
41 |
-
-------
|
42 |
-
>>> session_state = SessionState(user_name='', favorite_color='black')
|
43 |
-
>>> session_state.user_name = 'Mary'
|
44 |
-
''
|
45 |
-
>>> session_state.favorite_color
|
46 |
-
'black'
|
47 |
-
|
48 |
-
"""
|
49 |
-
for key, val in kwargs.items():
|
50 |
-
setattr(self, key, val)
|
51 |
-
|
52 |
-
|
53 |
-
def get(**kwargs):
|
54 |
-
"""Gets a SessionState object for the current session.
|
55 |
-
|
56 |
-
Creates a new object if necessary.
|
57 |
-
|
58 |
-
Parameters
|
59 |
-
----------
|
60 |
-
**kwargs : any
|
61 |
-
Default values you want to add to the session state, if we're creating a
|
62 |
-
new one.
|
63 |
-
|
64 |
-
Example
|
65 |
-
-------
|
66 |
-
>>> session_state = get(user_name='', favorite_color='black')
|
67 |
-
>>> session_state.user_name
|
68 |
-
''
|
69 |
-
>>> session_state.user_name = 'Mary'
|
70 |
-
>>> session_state.favorite_color
|
71 |
-
'black'
|
72 |
-
|
73 |
-
Since you set user_name above, next time your script runs this will be the
|
74 |
-
result:
|
75 |
-
>>> session_state = get(user_name='', favorite_color='black')
|
76 |
-
>>> session_state.user_name
|
77 |
-
'Mary'
|
78 |
-
|
79 |
-
"""
|
80 |
-
# Hack to get the session object from Streamlit.
|
81 |
-
|
82 |
-
ctx = ReportThread.get_report_ctx()
|
83 |
-
|
84 |
-
this_session = None
|
85 |
-
|
86 |
-
current_server = Server.get_current()
|
87 |
-
if hasattr(current_server, "_session_infos"):
|
88 |
-
# Streamlit < 0.56
|
89 |
-
session_infos = Server.get_current()._session_infos.values()
|
90 |
-
else:
|
91 |
-
session_infos = Server.get_current()._session_info_by_id.values()
|
92 |
-
|
93 |
-
for session_info in session_infos:
|
94 |
-
s = session_info.session
|
95 |
-
if (
|
96 |
-
# Streamlit < 0.54.0
|
97 |
-
(hasattr(s, "_main_dg") and s._main_dg == ctx.main_dg)
|
98 |
-
or
|
99 |
-
# Streamlit >= 0.54.0
|
100 |
-
(not hasattr(s, "_main_dg") and s.enqueue == ctx.enqueue)
|
101 |
-
or
|
102 |
-
# Streamlit >= 0.65.2
|
103 |
-
(
|
104 |
-
not hasattr(s, "_main_dg")
|
105 |
-
and s._uploaded_file_mgr == ctx.uploaded_file_mgr
|
106 |
-
)
|
107 |
-
):
|
108 |
-
this_session = s
|
109 |
-
|
110 |
-
if this_session is None:
|
111 |
-
raise RuntimeError(
|
112 |
-
"Oh noes. Couldn't get your Streamlit Session object. "
|
113 |
-
"Are you doing something fancy with threads?"
|
114 |
-
)
|
115 |
-
|
116 |
-
# Got the session object! Now let's attach some state into it.
|
117 |
-
|
118 |
-
if not hasattr(this_session, "_custom_session_state"):
|
119 |
-
this_session._custom_session_state = SessionState(**kwargs)
|
120 |
-
|
121 |
-
return this_session._custom_session_state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/utils.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import base64
|
|
|
2 |
import altair as alt
|
3 |
import pandas as pd
|
4 |
import streamlit as st
|
@@ -7,7 +8,6 @@ from PIL import Image
|
|
7 |
from .configs import SupportedFiles
|
8 |
|
9 |
|
10 |
-
|
11 |
@st.cache
|
12 |
def get_logo(path):
|
13 |
return Image.open(path)
|
@@ -52,7 +52,12 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
|
|
52 |
|
53 |
return
|
54 |
|
55 |
-
source =
|
|
|
|
|
|
|
|
|
|
|
56 |
source["Props"] = source["Counts"] / source["Counts"].sum()
|
57 |
source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
58 |
|
@@ -65,7 +70,9 @@ def plot_labels_prop(data: pd.DataFrame, label_column: str):
|
|
65 |
)
|
66 |
)
|
67 |
|
68 |
-
text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
|
|
|
|
|
69 |
|
70 |
return (bars + text).properties(height=300)
|
71 |
|
@@ -77,7 +84,9 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
|
|
77 |
alt.Chart(source)
|
78 |
.mark_bar()
|
79 |
.encode(
|
80 |
-
alt.X(
|
|
|
|
|
81 |
alt.Y("count()", axis=alt.Axis(title="")),
|
82 |
)
|
83 |
)
|
@@ -87,7 +96,11 @@ def plot_nchars(data: pd.DataFrame, text_column: str):
|
|
87 |
|
88 |
def plot_score(data: pd.DataFrame, label_col: str, label: str):
|
89 |
|
90 |
-
source =
|
|
|
|
|
|
|
|
|
91 |
|
92 |
plot = (
|
93 |
alt.Chart(source)
|
|
|
1 |
import base64
|
2 |
+
|
3 |
import altair as alt
|
4 |
import pandas as pd
|
5 |
import streamlit as st
|
|
|
8 |
from .configs import SupportedFiles
|
9 |
|
10 |
|
|
|
11 |
@st.cache
|
12 |
def get_logo(path):
|
13 |
return Image.open(path)
|
|
|
52 |
|
53 |
return
|
54 |
|
55 |
+
source = (
|
56 |
+
data[label_column]
|
57 |
+
.value_counts()
|
58 |
+
.reset_index()
|
59 |
+
.rename(columns={"index": "Labels", label_column: "Counts"})
|
60 |
+
)
|
61 |
source["Props"] = source["Counts"] / source["Counts"].sum()
|
62 |
source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
63 |
|
|
|
70 |
)
|
71 |
)
|
72 |
|
73 |
+
text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
|
74 |
+
text="Proportions:O"
|
75 |
+
)
|
76 |
|
77 |
return (bars + text).properties(height=300)
|
78 |
|
|
|
84 |
alt.Chart(source)
|
85 |
.mark_bar()
|
86 |
.encode(
|
87 |
+
alt.X(
|
88 |
+
f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
|
89 |
+
),
|
90 |
alt.Y("count()", axis=alt.Axis(title="")),
|
91 |
)
|
92 |
)
|
|
|
96 |
|
97 |
def plot_score(data: pd.DataFrame, label_col: str, label: str):
|
98 |
|
99 |
+
source = (
|
100 |
+
data.loc[data[label_col] == label]
|
101 |
+
.sort_values("score", ascending=False)
|
102 |
+
.head(100)
|
103 |
+
)
|
104 |
|
105 |
plot = (
|
106 |
alt.Chart(source)
|
src/wordifier.py
CHANGED
@@ -12,7 +12,9 @@ from sklearn.utils import resample
|
|
12 |
from .configs import InputTransformConfigs, ModelConfigs
|
13 |
|
14 |
|
15 |
-
def input_transform(
|
|
|
|
|
16 |
"""
|
17 |
Encodes text in mathematical object ameanable to training algorithm
|
18 |
"""
|
@@ -45,7 +47,11 @@ def input_transform(text: pd.Series, labels: pd.Series, configs=InputTransformCo
|
|
45 |
|
46 |
|
47 |
def wordifier(
|
48 |
-
X: np.ndarray,
|
|
|
|
|
|
|
|
|
49 |
) -> List[Tuple[str, float, str]]:
|
50 |
|
51 |
n_instances, n_features = X.shape
|
@@ -85,7 +91,9 @@ def wordifier(
|
|
85 |
)
|
86 |
|
87 |
# sample indices to subsample matrix
|
88 |
-
selection = resample(
|
|
|
|
|
89 |
|
90 |
# fit
|
91 |
try:
|
@@ -110,20 +118,36 @@ def wordifier(
|
|
110 |
neg_scores = neg_scores / configs.NUM_ITERS.value
|
111 |
|
112 |
# get only active features
|
113 |
-
pos_positions = np.where(
|
114 |
-
|
|
|
|
|
|
|
|
|
115 |
|
116 |
# prepare DataFrame
|
117 |
-
pos = [
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
return pos, neg
|
121 |
|
122 |
|
123 |
-
def output_transform(
|
124 |
-
|
|
|
|
|
|
|
|
|
125 |
posdf["correlation"] = "positive"
|
126 |
-
negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
|
|
|
|
|
127 |
negdf["correlation"] = "negative"
|
128 |
|
129 |
output = pd.concat([posdf, negdf], ignore_index=False, axis=0)
|
|
|
12 |
from .configs import InputTransformConfigs, ModelConfigs
|
13 |
|
14 |
|
15 |
+
def input_transform(
|
16 |
+
text: pd.Series, labels: pd.Series, configs=InputTransformConfigs
|
17 |
+
) -> Dict[str, np.ndarray]:
|
18 |
"""
|
19 |
Encodes text in mathematical object ameanable to training algorithm
|
20 |
"""
|
|
|
47 |
|
48 |
|
49 |
def wordifier(
|
50 |
+
X: np.ndarray,
|
51 |
+
y: np.ndarray,
|
52 |
+
X_names: List[str],
|
53 |
+
y_names: List[str],
|
54 |
+
configs=ModelConfigs,
|
55 |
) -> List[Tuple[str, float, str]]:
|
56 |
|
57 |
n_instances, n_features = X.shape
|
|
|
91 |
)
|
92 |
|
93 |
# sample indices to subsample matrix
|
94 |
+
selection = resample(
|
95 |
+
np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
|
96 |
+
)
|
97 |
|
98 |
# fit
|
99 |
try:
|
|
|
118 |
neg_scores = neg_scores / configs.NUM_ITERS.value
|
119 |
|
120 |
# get only active features
|
121 |
+
pos_positions = np.where(
|
122 |
+
pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
|
123 |
+
)
|
124 |
+
neg_positions = np.where(
|
125 |
+
neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
|
126 |
+
)
|
127 |
|
128 |
# prepare DataFrame
|
129 |
+
pos = [
|
130 |
+
(X_names[i], pos_scores[c, i], y_names[c])
|
131 |
+
for c, i in zip(*pos_positions.nonzero())
|
132 |
+
]
|
133 |
+
neg = [
|
134 |
+
(X_names[i], neg_scores[c, i], y_names[c])
|
135 |
+
for c, i in zip(*neg_positions.nonzero())
|
136 |
+
]
|
137 |
|
138 |
return pos, neg
|
139 |
|
140 |
|
141 |
+
def output_transform(
|
142 |
+
pos: List[Tuple[str, float, str]], neg: List[Tuple[str, float, str]]
|
143 |
+
) -> DataFrame:
|
144 |
+
posdf = pd.DataFrame(pos, columns="word score label".split()).sort_values(
|
145 |
+
["label", "score"], ascending=False
|
146 |
+
)
|
147 |
posdf["correlation"] = "positive"
|
148 |
+
negdf = pd.DataFrame(neg, columns="word score label".split()).sort_values(
|
149 |
+
["label", "score"], ascending=False
|
150 |
+
)
|
151 |
negdf["correlation"] = "negative"
|
152 |
|
153 |
output = pd.concat([posdf, negdf], ignore_index=False, axis=0)
|