Spaces:
Runtime error
Runtime error
ugmSorcero
committed on
Commit
·
4107940
1
Parent(s):
d36f6ee
Adds file support (txt, pdf, csv)
Browse files
- interface/components.py +25 -1
- interface/pages.py +2 -0
- interface/utils.py +60 -0
- requirements.txt +2 -1
interface/components.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import streamlit as st
|
2 |
-
from interface.utils import get_pipelines, extract_text_from_url
|
3 |
from interface.draw_pipelines import get_pipeline_graph
|
4 |
|
5 |
|
@@ -84,3 +84,27 @@ def component_article_url(container):
|
|
84 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
85 |
]
|
86 |
return corpus
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
|
3 |
from interface.draw_pipelines import get_pipeline_graph
|
4 |
|
5 |
|
|
|
84 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
85 |
]
|
86 |
return corpus
|
87 |
+
|
88 |
+
|
89 |
+
def component_file_input(container):
    """Draw the file-upload widgets and build a corpus from the uploaded files.

    Inside ``container`` an "Enter Files" expander is rendered with one
    uploader. Every time an uploader holds a file whose text could be
    extracted, a separator and one more (empty) uploader are drawn, so the
    user can keep adding files. Rendering stops at the first uploader that is
    empty or whose file yields no text (``extract_text_from_file`` returned
    ``None``).

    Args:
        container: Context manager the widgets are drawn into
            (presumably a Streamlit container -- confirm against caller).

    Returns:
        A list of ``{"text": <extracted text>, "id": <index>}`` dicts, with
        ids numbered from 0 in upload order.
    """
    with container:
        texts = []
        # Widget keys start at 1 and advance once per accepted file, so each
        # uploader drawn during this run gets a distinct key.
        widget_key = 1
        with st.expander("Enter Files"):
            while True:
                uploaded = st.file_uploader(
                    "Upload a .txt, .pdf, .csv file", key=widget_key
                )
                if uploaded is None:
                    # Empty uploader: nothing more to read on this run.
                    break

                extracted_text = extract_text_from_file(uploaded)
                if extracted_text is None:
                    # Extraction failed or the type is unsupported; do not
                    # offer another uploader after this one.
                    break

                texts.append(extracted_text)
                widget_key += 1
                st.markdown("---")

        return [
            {"text": text, "id": doc_id} for doc_id, text in enumerate(texts)
        ]
|
interface/pages.py
CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
|
|
2 |
from streamlit_option_menu import option_menu
|
3 |
from core.search_index import index, search
|
4 |
from interface.components import (
|
|
|
5 |
component_show_pipeline,
|
6 |
component_show_search_result,
|
7 |
component_text_input,
|
@@ -59,6 +60,7 @@ def page_index(container):
|
|
59 |
input_funcs = {
|
60 |
"Raw Text": (component_text_input, "card-text"),
|
61 |
"URL": (component_article_url, "card-link"),
|
|
|
62 |
}
|
63 |
selected_input = option_menu(
|
64 |
"Input Text",
|
|
|
2 |
from streamlit_option_menu import option_menu
|
3 |
from core.search_index import index, search
|
4 |
from interface.components import (
|
5 |
+
component_file_input,
|
6 |
component_show_pipeline,
|
7 |
component_show_search_result,
|
8 |
component_text_input,
|
|
|
60 |
input_funcs = {
|
61 |
"Raw Text": (component_text_input, "card-text"),
|
62 |
"URL": (component_article_url, "card-link"),
|
63 |
+
"File": (component_file_input, "card-file"),
|
64 |
}
|
65 |
selected_input = option_menu(
|
66 |
"Input Text",
|
interface/utils.py
CHANGED
@@ -1,7 +1,10 @@
|
|
|
|
1 |
import core.pipelines as pipelines_functions
|
2 |
from inspect import getmembers, isfunction
|
3 |
from newspaper import Article
|
|
|
4 |
import streamlit as st
|
|
|
5 |
|
6 |
|
7 |
def get_pipelines():
|
@@ -21,3 +24,60 @@ def extract_text_from_url(url: str):
|
|
21 |
article.parse()
|
22 |
|
23 |
return article.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from io import StringIO
|
2 |
import core.pipelines as pipelines_functions
|
3 |
from inspect import getmembers, isfunction
|
4 |
from newspaper import Article
|
5 |
+
from PyPDF2 import PdfFileReader
|
6 |
import streamlit as st
|
7 |
+
import pandas as pd
|
8 |
|
9 |
|
10 |
def get_pipelines():
|
|
|
24 |
article.parse()
|
25 |
|
26 |
return article.text
|
27 |
+
|
28 |
+
|
29 |
+
def extract_text_from_file(file):
    """Extract plain text from an uploaded file, dispatching on its MIME type.

    Supported types are ``text/plain``, ``application/pdf`` and ``text/csv``.

    Args:
        file: Uploaded file object. It must expose ``type`` (the MIME type
            string); text files additionally need ``getvalue()`` returning
            bytes, while PDF and CSV files are handed directly to
            ``PdfFileReader`` / ``pd.read_csv`` as file-like objects.
            (Presumably a Streamlit ``UploadedFile`` -- confirm against
            caller.)

    Returns:
        The extracted text as a string, or ``None`` (after showing a
        Streamlit warning) when the MIME type is unsupported or a text file
        is not valid UTF-8.

        * text: the decoded file content.
        * pdf: the text of all pages concatenated; pages whose extraction
          fails are skipped.
        * csv: every string cell, row by row, each prefixed with one space
          (so a non-empty result starts with a space); non-string cells,
          including missing values, are ignored.
    """
    mime_type = file.type

    if mime_type == "text/plain":
        try:
            return file.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            # Report undecodable uploads the same way as unsupported ones
            # instead of crashing the page with a traceback.
            st.warning("Text file is not valid UTF-8 and could not be read")
            return None

    if mime_type == "application/pdf":
        # NOTE(review): numPages/getPage/extractText are the legacy camelCase
        # PyPDF2 API; kept here to match the file's PdfFileReader import.
        pdf_reader = PdfFileReader(file)
        page_texts = []
        for page_number in range(pdf_reader.numPages):
            try:
                page_texts.append(pdf_reader.getPage(page_number).extractText())
            except Exception:
                # Best effort: one unreadable page must not discard the rest.
                # (Narrower than a bare except, so Ctrl-C still propagates.)
                continue
        return "".join(page_texts)

    if mime_type == "text/csv":
        frame = pd.read_csv(file)
        # Keep string cells only, in row-major order. Numeric cells and NaN
        # (missing values are floats) fail the isinstance check, so no
        # separate dtype selection or NaN filtering is needed.
        string_cells = (
            cell
            for row in frame.itertuples(index=False, name=None)
            for cell in row
            if isinstance(cell, str)
        )
        return "".join(f" {cell}" for cell in string_cells)

    st.warning(f"File type {file.type} not supported")
    return None
|
83 |
+
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ streamlit_option_menu==0.3.2
|
|
3 |
farm-haystack==1.8.0
|
4 |
black==22.8.0
|
5 |
plotly==5.10.0
|
6 |
-
newspaper3k==0.2.8
|
|
|
|
3 |
farm-haystack==1.8.0
|
4 |
black==22.8.0
|
5 |
plotly==5.10.0
|
6 |
+
newspaper3k==0.2.8
|
7 |
+
PyPDF2==2.10.7
|