Initial Draft
app.py CHANGED
@@ -1,8 +1,10 @@
 import streamlit as st
 import os
 import requests
+import re
 
 from langchain_community.document_loaders import PyPDFLoader
+from langchain.docstore.document import Document
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores.faiss import FAISS
@@ -59,6 +61,26 @@ def fn_create_vector_db(mv_pdf_input_file, mv_processing_message):
     # -- Loading PDF Data
     lv_pdf_loader = PyPDFLoader(lv_temp_pdf_file_path)
     lv_pdf_content = lv_pdf_loader.load()
+
+    # -- Define patterns with flexibility
+    pattern1 = r"(\w+)-\n(\w+)"        # Match hyphenated words separated by a line break
+    pattern2 = r"(?<!\n\s)\n(?!\s\n)"  # Match line breaks not surrounded by whitespace
+    pattern3 = r"\n\s*\n"              # Match multiple line breaks with optional whitespace
+
+    lv_pdf_formatted_content = []
+    for lv_page in lv_pdf_content:
+        # -- Apply substitutions with flexibility
+        lv_pdf_page_content = re.sub(pattern1, r"\1\2", lv_page.page_content)
+        lv_pdf_page_content = re.sub(pattern2, " ", lv_pdf_page_content.strip())
+        lv_pdf_page_content = re.sub(pattern3, " ", lv_pdf_page_content)
+        lv_pdf_page_content = re.sub("\n", " ", lv_pdf_page_content)
+
+        lv_pdf_formatted_content.append(Document(page_content=lv_pdf_page_content,
+                                                 metadata=lv_page.metadata)
+                                       )
+
+        print("Page Details of " + str(lv_page.metadata) + " is - " + lv_pdf_page_content)
+
     print("Step2: PDF content extracted")
     fn_display_user_messages("Step2: PDF content extracted", "Info", mv_processing_message)
 
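
Note on the added block: the three substitutions repair typical PDF extraction artifacts before chunking — pattern1 rejoins words hyphenated across a line break, pattern2 flattens line wraps inside a paragraph into spaces, and pattern3 collapses paragraph breaks. A minimal standalone sketch of the same pipeline, with an illustrative input string that is not taken from the app:

import re

pattern1 = r"(\w+)-\n(\w+)"        # rejoin words hyphenated across a line break
pattern2 = r"(?<!\n\s)\n(?!\s\n)"  # line breaks inside a paragraph
pattern3 = r"\n\s*\n"              # paragraph breaks with optional whitespace

lv_sample = "Large language mod-\nels read text\nsplit across lines."  # illustrative input
lv_clean = re.sub(pattern1, r"\1\2", lv_sample)
lv_clean = re.sub(pattern2, " ", lv_clean.strip())
lv_clean = re.sub(pattern3, " ", lv_clean)
lv_clean = re.sub("\n", " ", lv_clean)
print(lv_clean)  # -> Large language models read text split across lines.
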
@@ -69,7 +91,7 @@ def fn_create_vector_db(mv_pdf_input_file, mv_processing_message):
         chunk_overlap=30,
         length_function=len
     )
-    lv_pdf_chunk_documents = lv_text_splitter.split_documents(lv_pdf_content)
+    lv_pdf_chunk_documents = lv_text_splitter.split_documents(lv_pdf_formatted_content)
     print("Step3: PDF content chunked and document object created")
     fn_display_user_messages("Step3: PDF content chunked and document object created", "Info", mv_processing_message)
 
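
The change itself is narrow: split_documents now receives the cleaned lv_pdf_formatted_content rather than the raw loader output, so chunk boundaries are computed on de-hyphenated, single-line text. Since most of the splitter configuration sits above this hunk, here is a minimal sketch of what the call does; the separator and chunk_size values are illustrative assumptions, not values visible in the diff:

from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

lv_text_splitter = CharacterTextSplitter(
    separator=" ",       # assumption: the real separator is defined above the hunk
    chunk_size=300,      # assumption: the real chunk size is defined above the hunk
    chunk_overlap=30,
    length_function=len
)

lv_docs = [Document(page_content="word " * 200, metadata={"page": 0})]
lv_chunks = lv_text_splitter.split_documents(lv_docs)
print(len(lv_chunks), lv_chunks[0].metadata)  # several overlapping chunks; page metadata is preserved
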
@@ -166,32 +188,46 @@ def fn_generate_QnA_response(mv_selected_model, mv_user_question, lv_vector_store, mv_processing_message):
     elif mv_selected_model == 'mistralai/Mistral-7B-Instruct-v0.2':
         lv_model_path = "model/mistral-7b-instruct-v0.2.Q2_K.gguf"
         lv_model_type = "mistral"
-
-
+
     print("Step4: Generating LLM response")
     fn_display_user_messages("Step4: Generating LLM response", "Info", mv_processing_message)
 
     lv_model = LlamaCpp(
         model_path=lv_model_path,
-        temperature=0.
+        temperature=0.00,
         max_tokens=2048,
         top_p=1,
+        n_ctx=2048,
         verbose=False
     )
-
-
-
-
-
-
-
-
-
+    lv_vector_search_result = lv_vector_store.similarity_search(mv_user_question, k=2)
+    # print("Vector Search Result - ")
+    # print(lv_vector_search_result)
+
+    # -- Creating formatted document result
+    lv_document_context = ""
+    lv_count = 0
+    for lv_result in lv_vector_search_result:
+        print("Concatenating Result of page - " + str(lv_count) + " with content of document page no - " + str(lv_result.metadata["page"]))
+        lv_document_context += lv_result.page_content
+        lv_count += 1
+
+    # print("Formatted Document Search Result - ")
+    # print(lv_document_context)
+
+    lv_qa_formatted_prompt = lv_qa_prompt.format(
+        question=mv_user_question,
+        context=lv_document_context
+    )
+    print("Formatted Prompt - " + lv_qa_formatted_prompt)
+
+    lv_llm_response = lv_model(lv_qa_formatted_prompt)
+    # print("LLM Response" + lv_llm_response)
 
     print("Step5: LLM response generated")
     fn_display_user_messages("Step5: LLM response generated", "Info", mv_processing_message)
 
-    return
+    return lv_llm_response
 
 # Main Function
 def main():
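
The added block replaces the previous answer-generation code (its removed lines are not recoverable from this view) with a manual RAG flow: retrieve the two nearest chunks, concatenate them into a context string, format the QA prompt, and run it through llama.cpp. The added n_ctx=2048 raises llama.cpp's context window from its much smaller default, making room for the formatted prompt with two retrieved chunks; max_tokens=2048 separately caps the completion length. A minimal standalone sketch of the same retrieve-format-answer flow over an in-memory FAISS index; the embedding model and the prompt template are assumptions, since neither appears in this diff:

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores.faiss import FAISS
from langchain.prompts import PromptTemplate

# Assumption: the app defines lv_qa_prompt elsewhere; this template is illustrative.
lv_qa_prompt = PromptTemplate(
    input_variables=["question", "context"],
    template="Answer using only the context.\nContext: {context}\nQuestion: {question}\nAnswer:",
)

# Assumed embedding model; the one the app uses is not shown in this diff.
lv_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
lv_vector_store = FAISS.from_documents(
    [Document(page_content="Paris is the capital of France.", metadata={"page": 0}),
     Document(page_content="France borders Spain and Italy.", metadata={"page": 1})],
    lv_embeddings,
)

mv_user_question = "What is the capital of France?"
lv_document_context = ""
for lv_result in lv_vector_store.similarity_search(mv_user_question, k=2):
    lv_document_context += lv_result.page_content

lv_qa_formatted_prompt = lv_qa_prompt.format(question=mv_user_question, context=lv_document_context)
print(lv_qa_formatted_prompt)
# Passing lv_qa_formatted_prompt to the LlamaCpp model configured above
# (lv_model(lv_qa_formatted_prompt)) would produce the answer string.
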