# NOTE(review): removed web-scrape residue (Hugging Face Spaces page chrome,
# commit hashes, and a line-number gutter) that made this file invalid Python.
import os
import re
import tempfile

import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from transformers import pipeline
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
st.set_page_config(page_title="Automated Question Answering System")  # browser-tab title; must be the first Streamlit call

# Page heading
st.markdown("<h2 style='text-align: center;'>Question Answering on Academic Essays</h2>", unsafe_allow_html=True)

# Project description
st.markdown("<h3 style='text-align: left; color:#F63366; font-size:18px;'><b>What is this project about?<b></h3>", unsafe_allow_html=True)
st.write("This project is to develop a web-based automated question-and-answer system for academic essays using natural language processing (NLP). Users can enter the essay and ask questions about it, and the system will automatically create answers.")
# Leading glyph restored from mojibake: "π" was 👉 (UTF-8) mis-decoded as ISO-8859-7.
st.write("👉 Click 'Input Text' or 'Upload File' to start experience the system. ")
# store the model in cache resources to enhance efficiency (ref: https://docs.streamlit.io/library/advanced-features/caching)
@st.cache_resource(show_spinner=False)
def question_model():
    """Load the fine-tuned QA model once and cache it for the whole session.

    Returns:
        A transformers question-answering pipeline backed by the
        "kxx-kkk/FYP_qa_final" checkpoint. handle_impossible_answer=True
        lets the pipeline return an empty answer for unanswerable questions.
    """
    with st.spinner(text="Loading question model..."):
        model_name = "kxx-kkk/FYP_qa_final"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, handle_impossible_answer=True)
        print("QA model is downloaded and ready to use")  # fixed typo: "dowloaded"
        return question_answerer

# Loaded eagerly at import time; subsequent reruns hit the cache_resource cache.
qamodel = question_model()
@st.cache_data(show_spinner=False)
def extract_text(file_path):
    """OCR every page of a PDF and return the document text as one string.

    Pages are rendered to images with pdf2image and run through Tesseract.
    Single line breaks are then collapsed into spaces so OCR line-wrapping
    does not split sentences, while blank lines (paragraph breaks) survive.

    Args:
        file_path: Filesystem path of the PDF to process.

    Returns:
        The OCR'd text of the whole document.
    """
    with st.spinner(text="Extracting text from file..."):
        # NOTE(review): the original also extracted embedded text with PyPDF2
        # page-by-page, but then discarded it (`text = image_text`), so that
        # pass was dead work (and could crash when extract_text() returned
        # None). It has been removed; output is unchanged (OCR text only).
        image_text = ""
        images = convert_from_path(file_path)  # render each PDF page as an image
        for image in images:
            image_text += pytesseract.image_to_string(image)
        # Collapse lone newlines to spaces; keep runs of newlines intact.
        text = re.sub(r"(?<!\n)\n(?!\n)", " ", image_text)
        return text
# get the answer by passing the context & question to the model
def question_answering(context, question):
    """Run the cached QA pipeline on (context, question) and render the result.

    Args:
        context: The essay text to search for an answer.
        question: The user's question about the essay.

    Side effects:
        Writes the answer (or "CANNOT ANSWER") and the model score into a
        bordered Streamlit container.
    """
    with st.spinner(text="Getting answer..."):
        # qamodel is built once at import time (st.cache_resource), so the
        # original "Loading question model..." spinner around a bare
        # assignment did nothing useful and was removed.
        answer = qamodel(context=context, question=question)
        print(answer)  # debug trace in the server log
        answer_score = str(answer["score"])
        answer = answer["answer"]
        # handle_impossible_answer=True makes the pipeline return "" when unanswerable
        if answer == "":
            answer = "CANNOT ANSWER"
    # NOTE(review): the displayed "F1 score" is actually the pipeline's
    # confidence score, not an F1 metric — UI text kept unchanged on purpose.
    container = st.container(border=True)
    container.write("<h5><b>Answer:</b></h5>"+answer+"<p><small>(F1 score: "+answer_score+")</small></p><br>", unsafe_allow_html=True)
# def question_answering(context, question):
# with st.spinner(text="Loading question model..."):
# question_answerer = qamodel
# print("loading QA model...")
# with st.spinner(text="Getting answer..."):
# segment_size = 512
# overlap_size = 32
# text_length = len(context)
# segments = []
# # Split context into segments
# for i in range(0, text_length, segment_size - overlap_size):
# segment_start = i
# segment_end = i + segment_size
# segment = context[segment_start:segment_end]
# segments.append(segment)
# answers = {} # Dictionary to store answers for each segment
# # Get answers for each segment
# for i, segment in enumerate(segments):
# answer = question_answerer(context=segment, question=question)
# answers[i] = answer
# # Find the answer with the highest score
# highest_score = -1
# highest_answer = None
# for segment_index, answer in answers.items():
# print(answer)
# score = answer["score"]
# if score > highest_score:
# highest_score = score
# highest_answer = answer
# if highest_answer is not None:
# answer = highest_answer["answer"]
# if answer == "":
# answer = "CANNOT ANSWER"
# answer_score = str(highest_answer["score"])
# # Display the result in container
# container = st.container(border=True)
# container.write("<h5><b>Answer:</b></h5>" + answer + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
# unsafe_allow_html=True)
#-------------------- Main Webpage --------------------
# choose the source with different tabs
tab1, tab2 = st.tabs(["Input Text", "Upload File"])

#---------- input text ----------
# if type the text as input
with tab1:
    # Example essay/question used by the "Try with example" button
    sample_question = "What is NLP?"
    with open("sample.txt", "r") as text_file:
        sample_text = text_file.read()

    example = st.button("Try with example")
    if example:
        # BUGFIX: fill the keyed widgets through session_state BEFORE they are
        # instantiated. Once a widget key exists in st.session_state, Streamlit
        # ignores the value= argument, so the original value=context approach
        # only worked on the very first render.
        st.session_state["contextInput"] = sample_text
        st.session_state["questionInput"] = sample_question

    # Widget state persists under the keys, so no value= argument is needed.
    context = st.text_area("Enter the essay below:", key="contextInput", height=330)
    question = st.text_input(label="Enter the question: ", key="questionInput")

    # perform question answering when "Get answer" button clicked
    button = st.button("Get answer", key="textInput", type="primary")
    if button:
        if context == "" or question == "":
            # icon restored from mojibake ("π¨" was 🚨 mis-decoded as ISO-8859-7)
            st.error("Please enter BOTH the context and the question", icon="🚨")
        else:
            question_answering(context, question)
# ---------- upload file ----------
# if upload file as input
with tab2:
    # provide upload place
    uploaded_file = st.file_uploader("Upload essay in PDF format:", type=["pdf"])

    # Session flags: which file we last processed, and whether OCR already ran for it.
    if 'file' not in st.session_state:
        st.session_state.file = None
    if 'text_extracted' not in st.session_state:
        st.session_state.text_extracted = False

    # transfer file to context and allow ask question, then perform question answering
    if uploaded_file is not None:
        if st.session_state.file != uploaded_file:
            # A different file was uploaded -> force re-extraction.
            st.session_state.file = uploaded_file
            st.session_state.text_extracted = False
        if not st.session_state.text_extracted:
            temp_path = None
            try:
                # Save the upload to a real path (pdf2image needs a file path).
                with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                    temp_file.write(uploaded_file.read())
                    temp_path = temp_file.name
                # Extract after the handle is closed (safe on Windows too).
                raw_text = extract_text(temp_path)
            finally:
                # BUGFIX: the original never deleted the delete=False temp file.
                if temp_path is not None:
                    os.remove(temp_path)
            # BUGFIX: fill the keyed widget through session_state; a value=
            # argument is ignored once the key already exists in session_state,
            # so the extracted text previously never appeared after the first render.
            st.session_state["contextInput2"] = raw_text
            st.session_state.text_extracted = True

    question2 = st.text_input(label="Enter your question", key="questionInput2")
    context2 = st.text_area("Your essay context: ", height=330, key="contextInput2")

    # perform question answering when "Get answer" button clicked
    button2 = st.button("Get answer", key="fileInput", type="primary")
    if button2:
        if context2 == "" or question2 == "":
            # icon restored from mojibake ("π¨" was 🚨 mis-decoded as ISO-8859-7)
            st.error("Please enter BOTH the context and the question", icon="🚨")
        else:
            question_answering(context2, question2)

# Footer ("Β©" restored to the intended copyright sign)
st.markdown("<p style='text-align:center;'>© 20069913D HUI Man Ki - Final Year Project</p>", unsafe_allow_html=True)
st.markdown("<br><br><br><br><br>", unsafe_allow_html=True)