File size: 8,573 Bytes
40c6322
0b94a7f
40c6322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed033e8
fe6f1d7
 
40c6322
 
5154a59
40c6322
 
5154a59
 
 
 
 
 
40c6322
 
8a81b94
 
e3706e0
 
 
 
 
4ef7de6
 
 
 
 
 
 
 
e3706e0
 
 
 
 
 
 
 
 
 
 
750111a
505522b
 
5d6daad
505522b
750111a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40c6322
 
 
 
ed033e8
40c6322
 
 
 
 
 
 
 
 
 
 
 
 
ed033e8
 
 
 
 
 
 
fe6f1d7
 
 
 
 
 
4c97480
40c6322
 
 
 
 
 
 
 
 
 
ed033e8
ae2f761
b8672c4
 
 
366b039
 
 
 
6ac0564
ae2f761
 
40c6322
 
 
b8672c4
 
 
 
 
366b039
 
 
e3706e0
 
366b039
b8672c4
dc9a4f7
579087e
 
6ac0564
 
94edff4
6ac0564
579087e
6ac0564
 
579087e
40c6322
94edff4
40c6322
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import streamlit as st
import re

from transformers import pipeline
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

import tempfile
import pytesseract
import PyPDF2
from pdf2image import convert_from_path
from PIL import Image


# Browser-tab title for the app.
st.set_page_config(page_title="Automated Question Answering System")

# heading
st.markdown("<h2 style='text-align: center;'>Question Answering on Academic Essays</h2>", unsafe_allow_html=True)
# description (the bold element is now properly closed with </b>)
st.markdown("<h3 style='text-align: left; color:#F63366; font-size:18px;'><b>What is this project about?</b></h3>", unsafe_allow_html=True)
st.write("This project is to develop a web-based automated question-and-answer system for academic essays using natural language processing (NLP). Users can enter the essay and ask questions about it, and the system will automatically create answers.")
# NOTE(review): the leading emoji was stored as mis-decoded bytes; restored as a
# thumbs-up, which is the most likely original -- confirm with the author.
st.write("👍 Click 'Input Text' or 'Upload File' to start experience the system. ")

# keep the loaded pipeline in Streamlit's resource cache so it is built only once
# (ref: https://docs.streamlit.io/library/advanced-features/caching)
@st.cache_resource(show_spinner=False)
def question_model():
    """Build and return the question-answering pipeline.

    The tokenizer and the model are both loaded from the same checkpoint and
    wrapped in a ``question-answering`` pipeline created with
    ``handle_impossible_answer=True``. A spinner is shown while loading.

    Returns:
        The pipeline object produced by ``transformers.pipeline``.
    """
    checkpoint = "kxx-kkk/FYP_qa_final"
    with st.spinner(text="Loading question model..."):
        qa_pipeline = pipeline(
            "question-answering",
            # tokenizer is listed first so it is loaded before the model
            tokenizer=AutoTokenizer.from_pretrained(checkpoint),
            model=AutoModelForQuestionAnswering.from_pretrained(checkpoint),
            handle_impossible_answer=True,
        )
        print("QA model is dowloaded and ready to use")
    return qa_pipeline

# Module-level handle to the QA pipeline. question_model() is wrapped in
# st.cache_resource, and question_answering() below reads this name.
qamodel = question_model()

@st.cache_data(show_spinner=False)
def extract_text(file_path):
    """Return the OCR text of the PDF stored at ``file_path``.

    Each page is converted to an image with pdf2image and passed through
    Tesseract. The PDF's embedded text layer is not read: the earlier code
    extracted it with PyPDF2 and then overwrote it with the OCR output, so
    that pass only cost time and never affected the result.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        str: The OCR output of all pages joined in page order, with every
        isolated line break replaced by a space. Runs of two or more
        newlines are left untouched.
    """
    with st.spinner(text="Extracting text from file..."):
        page_images = convert_from_path(file_path)  # Convert PDF pages to images
        ocr_text = "".join(
            pytesseract.image_to_string(page_image) for page_image in page_images
        )
        # A newline with no newline on either side becomes a space; blank-line
        # separators (consecutive newlines) are kept.
        text = re.sub(r"(?<!\n)\n(?!\n)", " ", ocr_text)
    return text

# get the answer by passing the context & question to the model
def question_answering(context, question):
    """Ask the QA pipeline ``question`` about ``context`` and render the answer.

    Args:
        context: Essay text in which the answer is searched.
        question: Question about the essay.

    Returns:
        None. The answer and its score are written to the page inside a
        bordered container; the raw pipeline result is also printed.
    """
    import html  # local import: only needed to escape the answer below

    with st.spinner(text="Getting answer..."):
        result = qamodel(context=context, question=question)
        print(result)
        answer_score = str(result["score"])
        # An empty answer string is shown as "CANNOT ANSWER" (the pipeline is
        # created with handle_impossible_answer=True, so it may return one).
        answer = result["answer"] or "CANNOT ANSWER"

        # display the result in container
        # The answer is a span of user-supplied text and is rendered with
        # unsafe_allow_html=True, so escape it to keep any markup in the essay
        # from being interpreted as HTML.
        # The pipeline's "score" is the model's probability for the answer,
        # not an F1 score, so it is labelled accordingly.
        container = st.container(border=True)
        container.write(
            "<h5><b>Answer:</b></h5>"
            + html.escape(answer)
            + "<p><small>(Confidence score: "
            + answer_score
            + ")</small></p><br>",
            unsafe_allow_html=True,
        )

# def question_answering(context, question):
#     with st.spinner(text="Loading question model..."):
#         question_answerer = qamodel
#         print("loading QA model...")
#     with st.spinner(text="Getting answer..."):
#         segment_size = 512   
#         overlap_size = 32
#         text_length = len(context)
#         segments = []

#         # Split context into segments
#         for i in range(0, text_length, segment_size - overlap_size):
#             segment_start = i
#             segment_end = i + segment_size
#             segment = context[segment_start:segment_end]
#             segments.append(segment)

#         answers = {}  # Dictionary to store answers for each segment

#         # Get answers for each segment
#         for i, segment in enumerate(segments):
#             answer = question_answerer(context=segment, question=question)
#             answers[i] = answer

#         # Find the answer with the highest score
#         highest_score = -1
#         highest_answer = None
#         for segment_index, answer in answers.items():
#             print(answer)
#             score = answer["score"]
#             if score > highest_score:
#                 highest_score = score
#                 highest_answer = answer

#         if highest_answer is not None:
#             answer = highest_answer["answer"]
#             if answer == "": 
#                 answer = "CANNOT ANSWER"
#             answer_score = str(highest_answer["score"])

#             # Display the result in container
#             container = st.container(border=True)
#             container.write("<h5><b>Answer:</b></h5>" + answer + "<p><small>(F1 score: " + answer_score + ")</small></p><br>",
#                             unsafe_allow_html=True)


#-------------------- Main Webpage  --------------------
# one tab per input source
tab1, tab2 = st.tabs(["Input Text", "Upload File"])

#---------- input text ----------
# the essay and the question are typed in directly
with tab1:
    # built-in example: essay stored in sample.txt plus a fixed question
    with open("sample.txt", "r") as sample_file:
        example_essay = sample_file.read()
    example_question = "What is NLP?"

    # "Try with example" pre-fills both fields; otherwise reuse whatever the
    # widgets currently hold in the session state (empty on the first run)
    if st.button("Try with example"):
        default_context, default_question = example_essay, example_question
    else:
        default_context = st.session_state.get("contextInput", "")
        default_question = st.session_state.get("questionInput", "")

    # essay and question inputs
    context = st.text_area("Enter the essay below:", value=default_context, key="contextInput", height=330)
    question = st.text_input(label="Enter the question: ", value=default_question, key="questionInput")

    # run the model only when both fields are filled in
    if st.button("Get answer", key="textInput", type="primary"):
        if context and question:
            question_answering(context, question)
        else:
            st.error("Please enter BOTH the context and the question", icon="🚨")

# ---------- upload file ----------
# if upload file as input
with tab2:
    # provide upload place
    uploaded_file = st.file_uploader("Upload essay in PDF format:", type=["pdf"])

    # Session-level variable tracking the most recently seen uploaded file
    if 'file' not in st.session_state:
        st.session_state.file = None
    # Session-level flag: has the text of that file been extracted already?
    if 'text_extracted' not in st.session_state:
        st.session_state.text_extracted = False

    # Get the current values of context and question (empty on the first run)
    context2 = st.session_state.get("contextInput2", "")
    question2 = st.session_state.get("questionInput2", "")

    # transfer file to context and allow ask question, then perform question answering
    if uploaded_file is not None:
        if st.session_state.file != uploaded_file:
            # A different file was uploaded: remember it and redo the extraction
            st.session_state.file = uploaded_file
            st.session_state.text_extracted = False

        if not st.session_state.text_extracted:
            # extract_text() needs a path on disk. Write the upload into a
            # temporary directory that is removed as soon as extraction is
            # done, so uploads no longer leave files behind (the former
            # NamedTemporaryFile(delete=False) was never deleted).
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = f"{temp_dir}/uploaded.pdf"
                with open(temp_path, "wb") as temp_file:
                    temp_file.write(uploaded_file.read())
                raw_text = extract_text(temp_path)
            context2 = raw_text
            st.session_state.text_extracted = True

        question2 = st.text_input(label="Enter your question", value=question2, key="questionInput2")
        context2 = st.text_area("Your essay context: ", value=context2, height=330, key="contextInput2")

        # perform question answering when "get answer" button clicked
        button2 = st.button("Get answer", key="fileInput", type="primary")
        if button2:
            if context2 == "" or question2 == "":
                st.error("Please enter BOTH the context and the question", icon="🚨")
            else:
                question_answering(context2, question2)

# Footer: copyright line (the sign was stored as mis-decoded bytes and is
# restored here), followed by line breaks that pad the bottom of the page.
st.markdown("<p style='text-align:center;'>© 20069913D HUI Man Ki - Final Year Project</p>", unsafe_allow_html=True)
st.markdown("<br><br><br><br><br>", unsafe_allow_html=True)