HLasse committed on
Commit
a177196
·
1 Parent(s): 34563c5

feat: multifile processing

Browse files
Files changed (3) hide show
  1. app.py +45 -37
  2. data_viewer.py +9 -6
  3. process_text.py +66 -0
app.py CHANGED
@@ -5,11 +5,12 @@ Dashboard for showcasing extraction of text metrics with textdescriptives.
5
 
6
  from io import StringIO
7
 
8
- import numpy as np
9
  import streamlit as st
10
  import textdescriptives as td
11
 
12
  from data_viewer import DataViewer
 
13
  from options import (
14
  all_model_size_options_pretty_to_short,
15
  available_model_size_options,
@@ -28,7 +29,7 @@ with col1:
28
  with col2:
29
  st.image(
30
  "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
31
- width=125
32
  )
33
 
34
  st.write(
@@ -46,8 +47,8 @@ st.write(
46
 
47
  st.caption(
48
  "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
49
- "calculating a large variety of statistics from text. "
50
- "[arXiv preprint arXiv:2301.02057](https://arxiv.org/abs/2301.02057)"
51
  )
52
 
53
 
@@ -57,22 +58,25 @@ st.caption(
57
 
58
 
59
  input_choice = st.radio(
60
- label="Input", options=["Enter text", "Upload file"], index=0, horizontal=True
61
  )
62
 
63
  with st.form(key="settings_form"):
64
  split_by_line = st.checkbox(label="Split by newline", value=True)
65
 
66
- string_data = None
67
 
68
- if input_choice == "Upload file":
69
- uploaded_file = st.file_uploader(
70
- label="Choose a .txt file", type=["txt"], accept_multiple_files=False
71
  )
72
 
73
- if uploaded_file is not None:
74
  # To convert to a string based IO:
75
- string_data = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
 
 
 
76
 
77
  else:
78
  default_text = """Hello, morning dew. The grass whispers low.
@@ -81,9 +85,11 @@ Good morning, world. The birds sing in delight.
81
  Let's spread our wings. The butterflies take flight.
82
  Nature's chorus sings, a symphony of light."""
83
 
84
- string_data = st.text_area(
85
- label="Enter text", value=default_text, height=145, max_chars=None
86
- )
 
 
87
 
88
  # Row of selectors
89
  col1, col2 = st.columns([1, 1])
@@ -132,30 +138,26 @@ Nature's chorus sings, a symphony of light."""
132
  #############
133
 
134
 
135
- if apply_settings_button and string_data is not None and string_data:
136
  if model_size_pretty not in available_model_size_options(lang=language_short):
137
  st.write(
138
  "**Sorry!** The chosen *model size* is not available in this language. Please try another."
139
  )
140
  else:
141
- # Clean and (optionally) split the text
142
- string_data = string_data.strip()
143
- if split_by_line:
144
- string_data = string_data.split("\n")
145
- else:
146
- string_data = [string_data]
147
-
148
- # Remove empty strings
149
- # E.g. due to consecutive newlines
150
- string_data = [s for s in string_data if s]
151
-
152
- # Will automatically download the relevant model and extract all metrics
153
- # TODO: Download beforehand to speed up inference
154
- df = td.extract_metrics(
155
- text=string_data,
156
- lang=language_short,
157
- spacy_model_size=model_size_short,
158
- metrics=metrics,
159
  )
160
 
161
  ###################
@@ -165,13 +167,15 @@ if apply_settings_button and string_data is not None and string_data:
165
  # Create 2 columns with 1) the output header
166
  # and 2) a download button
167
  DataViewer()._header_and_download(
168
- header="The calculated metrics", data=df, file_name="text_metrics.csv"
 
 
169
  )
170
 
171
  st.write("**Note**: This data frame has been transposed for readability.")
172
- df = df.transpose().reset_index()
173
- df.columns = ["Metric"] + [str(c) for c in list(df.columns)[1:]]
174
- st.dataframe(data=df, use_container_width=True)
175
 
176
 
177
  ############################
@@ -182,6 +186,10 @@ if apply_settings_button and string_data is not None and string_data:
182
  with st.expander("See python code"):
183
  st.code(
184
  """
 
 
 
 
185
  import textdescriptives as td
186
 
187
  # Given a string of text and the settings
 
5
 
6
  from io import StringIO
7
 
8
+ import pandas as pd
9
  import streamlit as st
10
  import textdescriptives as td
11
 
12
  from data_viewer import DataViewer
13
+ from process_text import text_to_metrics
14
  from options import (
15
  all_model_size_options_pretty_to_short,
16
  available_model_size_options,
 
29
  with col2:
30
  st.image(
31
  "https://github.com/HLasse/TextDescriptives/raw/main/docs/_static/icon.png",
32
+ width=125,
33
  )
34
 
35
  st.write(
 
47
 
48
  st.caption(
49
  "Hansen, L., Olsen, L. R., & Enevoldsen, K. (2023). TextDescriptives: A Python package for "
50
+ "calculating a large variety of metrics from text. [Journal of Open Source Software, 8(84), "
51
+ "5153, https://doi.org/10.21105/joss.05153](https://doi.org/10.21105/joss.05153)"
52
  )
53
 
54
 
 
58
 
59
 
60
  input_choice = st.radio(
61
+ label="Input", options=["Enter text", "Upload file(s)"], index=0, horizontal=True
62
  )
63
 
64
  with st.form(key="settings_form"):
65
  split_by_line = st.checkbox(label="Split by newline", value=True)
66
 
67
+ file_name_to_text_string = {}
68
 
69
+ if input_choice == "Upload file(s)":
70
+ uploaded_files = st.file_uploader(
71
+ label="Choose a .txt file", type=["txt"], accept_multiple_files=True
72
  )
73
 
74
+ if uploaded_files is not None and len(uploaded_files) > 0:
75
  # To convert to a string based IO:
76
+ file_name_to_text_string = {
77
+ file.name: StringIO(file.getvalue().decode("utf-8")).read()
78
+ for file in uploaded_files
79
+ }
80
 
81
  else:
82
  default_text = """Hello, morning dew. The grass whispers low.
 
85
  Let's spread our wings. The butterflies take flight.
86
  Nature's chorus sings, a symphony of light."""
87
 
88
+ file_name_to_text_string = {
89
+ "input": st.text_area(
90
+ label="Enter text", value=default_text, height=145, max_chars=None
91
+ )
92
+ }
93
 
94
  # Row of selectors
95
  col1, col2 = st.columns([1, 1])
 
138
  #############
139
 
140
 
141
+ if apply_settings_button and len(file_name_to_text_string) > 0:
142
  if model_size_pretty not in available_model_size_options(lang=language_short):
143
  st.write(
144
  "**Sorry!** The chosen *model size* is not available in this language. Please try another."
145
  )
146
  else:
147
+ # Extract metrics for each text
148
+ output_df = pd.concat(
149
+ [
150
+ text_to_metrics(
151
+ string=string,
152
+ language_short=language_short,
153
+ model_size_short=model_size_short,
154
+ metrics=metrics,
155
+ split_by_line=split_by_line,
156
+ filename=filename if "Upload" in input_choice else None,
157
+ )
158
+ for filename, string in file_name_to_text_string.items()
159
+ ],
160
+ ignore_index=True,
 
 
 
 
161
  )
162
 
163
  ###################
 
167
  # Create 2 columns with 1) the output header
168
  # and 2) a download button
169
  DataViewer()._header_and_download(
170
+ header="The calculated metrics",
171
+ data=output_df,
172
+ file_name="text_metrics.csv",
173
  )
174
 
175
  st.write("**Note**: This data frame has been transposed for readability.")
176
+ output_df = output_df.transpose().reset_index()
177
+ output_df.columns = ["Metric"] + [str(c) for c in list(output_df.columns)[1:]]
178
+ st.dataframe(data=output_df, use_container_width=True)
179
 
180
 
181
  ############################
 
186
  with st.expander("See python code"):
187
  st.code(
188
  """
189
+ # Note: This is the code for a single text file
190
+ # The actual code is slightly more complex
191
+ # to allow processing multiple files at once
192
+
193
  import textdescriptives as td
194
 
195
  # Given a string of text and the settings
data_viewer.py CHANGED
@@ -1,14 +1,17 @@
 
 
 
1
 
2
  import streamlit as st
3
 
4
 
5
  class DataViewer:
6
-
7
- # @st.cache_data
8
  def _convert_df_to_csv(self, data, **kwargs):
9
- return data.to_csv(**kwargs).encode('utf-8')
10
 
11
- def _header_and_download(self, header, data, file_name, key=None, label="Download", help="Download data"):
 
 
12
  col1, col2 = st.columns([9, 2])
13
  with col1:
14
  st.subheader(header)
@@ -16,8 +19,8 @@ class DataViewer:
16
  st.write("")
17
  st.download_button(
18
  label=label,
19
- data=self._convert_df_to_csv(data),
20
  file_name=file_name,
21
  key=key,
22
- help=help
23
  )
 
1
+ """
2
+ Class for showing header and download button in the same row.
3
+ """
4
 
5
  import streamlit as st
6
 
7
 
8
  class DataViewer:
 
 
9
  def _convert_df_to_csv(self, data, **kwargs):
10
+ return data.to_csv(**kwargs).encode("utf-8")
11
 
12
+ def _header_and_download(
13
+ self, header, data, file_name, key=None, label="Download", help="Download data"
14
+ ):
15
  col1, col2 = st.columns([9, 2])
16
  with col1:
17
  st.subheader(header)
 
19
  st.write("")
20
  st.download_button(
21
  label=label,
22
+ data=self._convert_df_to_csv(data, index=False),
23
  file_name=file_name,
24
  key=key,
25
+ help=help,
26
  )
process_text.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The text processing functionality.
3
+ """
4
+
5
+ from typing import List, Optional
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import textdescriptives as td
9
+
10
+
11
+ @st.cache_data
12
+ def text_to_metrics(
13
+ string: str,
14
+ language_short: str,
15
+ model_size_short: str,
16
+ metrics: List[str],
17
+ split_by_line: bool,
18
+ filename: Optional[str],
19
+ ) -> pd.DataFrame:
20
+ # Clean and (optionally) split the text
21
+ string = string.strip()
22
+ if split_by_line:
23
+ strings = string.split("\n")
24
+ else:
25
+ strings = [string]
26
+
27
+ # Remove empty strings
28
+ # E.g. due to consecutive newlines
29
+ strings = [s for s in strings if s]
30
+
31
+ # Will automatically download the relevant model and extract all metrics
32
+ # TODO: Download beforehand to speed up inference
33
+ df = td.extract_metrics(
34
+ text=strings,
35
+ lang=language_short,
36
+ spacy_model_size=model_size_short,
37
+ metrics=metrics,
38
+ )
39
+
40
+ # Add filename
41
+ if filename is not None:
42
+ df["File"] = filename
43
+ move_column_inplace(df=df, col="File", pos=0)
44
+
45
+ return df
46
+
47
+
48
+ def move_column_inplace(df: pd.DataFrame, col: str, pos: int) -> None:
49
+ """
50
+ Move a column to a given column-index position.
51
+
52
+ Taken from the `utipy` package.
53
+
54
+ Parameters
55
+ ----------
56
+ df : `pandas.DataFrame`.
57
+ col : str
58
+ Name of column to move.
59
+ pos : int
60
+ Column index to move `col` to.
61
+ """
62
+ assert (
63
+ 0 <= pos < len(df.columns)
64
+ ), f"`pos` must be between 0 (incl.) and the number of columns -1. Was {pos}."
65
+ col = df.pop(col)
66
+ df.insert(pos, col.name, col)