rahgadda committed on
Commit 24457db · verified · 1 Parent(s): 863d81b

Initial Draft

Files changed (1)
  1. app.py +50 -14
app.py CHANGED
@@ -1,8 +1,10 @@
 import streamlit as st
 import os
 import requests
+import re
 
 from langchain_community.document_loaders import PyPDFLoader
+from langchain.docstore.document import Document
 from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores.faiss import FAISS
@@ -59,6 +61,26 @@ def fn_create_vector_db(mv_pdf_input_file, mv_processing_message):
     # -- Loading PDF Data
     lv_pdf_loader = PyPDFLoader(lv_temp_pdf_file_path)
     lv_pdf_content = lv_pdf_loader.load()
+
+    # -- Patterns for cleaning raw PDF text
+    pattern1 = r"(\w+)-\n(\w+)"        # Hyphenated words split across a line break
+    pattern2 = r"(?<!\n\s)\n(?!\s\n)"  # Line breaks not surrounded by whitespace
+    pattern3 = r"\n\s*\n"              # Multiple line breaks with optional whitespace
+
+    lv_pdf_formatted_content = []
+    for lv_page in lv_pdf_content:
+        # -- Apply the substitutions to each page
+        lv_pdf_page_content = re.sub(pattern1, r"\1\2", lv_page.page_content)
+        lv_pdf_page_content = re.sub(pattern2, " ", lv_pdf_page_content.strip())
+        lv_pdf_page_content = re.sub(pattern3, " ", lv_pdf_page_content)
+        lv_pdf_page_content = re.sub("\n", " ", lv_pdf_page_content)
+
+        lv_pdf_formatted_content.append(Document(page_content=lv_pdf_page_content,
+                                                 metadata=lv_page.metadata)
+                                       )
+
+        print("Page details of " + str(lv_page.metadata) + " - " + lv_pdf_page_content)
+
     print("Step2: PDF content extracted")
     fn_display_user_messages("Step2: PDF content extracted", "Info", mv_processing_message)
 
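The three patterns added here are easiest to sanity-check on a small sample outside the app. A minimal sketch (the sample string and expected output are illustrative, not from the commit):

import re

# Raw PyPDFLoader-style text: a hyphenated word split across a line
# break, plus a mid-sentence line break.
lv_sample = "Retrieval aug-\nmented generation\ncombines search with text generation."

lv_clean = re.sub(r"(\w+)-\n(\w+)", r"\1\2", lv_sample)           # rejoin "aug-\nmented" -> "augmented"
lv_clean = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", lv_clean.strip())  # single line breaks -> spaces
lv_clean = re.sub(r"\n\s*\n", " ", lv_clean)                      # collapse paragraph breaks (no-op here)
lv_clean = re.sub("\n", " ", lv_clean)                            # sweep up any leftovers (no-op here)

print(lv_clean)
# Retrieval augmented generation combines search with text generation.

Note that pattern2 uses fixed-width lookarounds, so it only skips a paragraph break when the two newlines are separated by whitespace (e.g. "\n \n"); a bare "\n\n" is consumed by pattern2 before pattern3 ever sees it, and the final substitution mops up whatever remains.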
@@ -69,7 +91,7 @@ def fn_create_vector_db(mv_pdf_input_file, mv_processing_message):
         chunk_overlap=30,
         length_function=len
     )
-    lv_pdf_chunk_documents = lv_text_splitter.split_documents(lv_pdf_content)
+    lv_pdf_chunk_documents = lv_text_splitter.split_documents(lv_pdf_formatted_content)
     print("Step3: PDF content chunked and document objects created")
     fn_display_user_messages("Step3: PDF content chunked and document objects created", "Info", mv_processing_message)
 
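Because every newline is replaced upstream, it is worth noting that CharacterTextSplitter's default separator is "\n\n", which no longer occurs in the cleaned pages; the splitter only yields multiple chunks if an explicit separator is configured. A standalone sketch, where separator and chunk_size are assumed values (the commit's real values sit above this hunk):

from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

# One fake "cleaned page": 200 words, no newlines, about 1000 characters
lv_docs = [Document(page_content="word " * 200, metadata={"page": 0})]

# separator=" " and chunk_size=500 are assumptions for this sketch
lv_splitter = CharacterTextSplitter(separator=" ",
                                    chunk_size=500,
                                    chunk_overlap=30,
                                    length_function=len)
lv_chunks = lv_splitter.split_documents(lv_docs)
print(len(lv_chunks), "chunks, first:", len(lv_chunks[0].page_content), "chars")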
@@ -166,32 +188,46 @@ def fn_generate_QnA_response(mv_selected_model, mv_user_question, lv_vector_store
     elif mv_selected_model == 'mistralai/Mistral-7B-Instruct-v0.2':
         lv_model_path = "model/mistral-7b-instruct-v0.2.Q2_K.gguf"
         lv_model_type = "mistral"
-
-
+
     print("Step4: Generating LLM response")
     fn_display_user_messages("Step4: Generating LLM response", "Info", mv_processing_message)
 
     lv_model = LlamaCpp(
         model_path=lv_model_path,
-        temperature=0.75,
+        temperature=0.00,
         max_tokens=2048,
         top_p=1,
+        n_ctx=2048,
         verbose=False
     )
-    lv_retriever = lv_vector_store.as_retriever(search_kwargs={'k': 2})
-    lv_qa_chain = RetrievalQA.from_chain_type(llm=lv_model,
-                                              chain_type='stuff',
-                                              retriever=lv_retriever,
-                                              return_source_documents=True,
-                                              chain_type_kwargs={'prompt': lv_qa_prompt}
-                                              )
-
-    lv_response = lv_qa_chain({"query": mv_user_question})
+    lv_vector_search_result = lv_vector_store.similarity_search(mv_user_question, k=2)
+    # print("Vector Search Result - ")
+    # print(lv_vector_search_result)
+
+    # -- Building the context string from the search results
+    lv_document_context = ""
+    lv_count = 0
+    for lv_result in lv_vector_search_result:
+        print("Concatenating result " + str(lv_count) + " from document page " + str(lv_result.metadata["page"]))
+        lv_document_context += lv_result.page_content
+        lv_count += 1
+
+    # print("Formatted Document Search Result - ")
+    # print(lv_document_context)
+
+    lv_qa_formatted_prompt = lv_qa_prompt.format(
+        question=mv_user_question,
+        context=lv_document_context
+    )
+    print("Formatted Prompt - " + lv_qa_formatted_prompt)
+
+    lv_llm_response = lv_model(lv_qa_formatted_prompt)
+    # print("LLM Response - " + lv_llm_response)
 
     print("Step5: LLM response generated")
     fn_display_user_messages("Step5: LLM response generated", "Info", mv_processing_message)
 
-    return lv_response['result']
+    return lv_llm_response
 
 # Main Function
 def main():
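The net effect of this hunk: the RetrievalQA chain is dropped in favor of an explicit pipeline of similarity_search, manual context concatenation, and a direct model call, with temperature dropped from 0.75 to 0.00 (greedy decoding) and n_ctx=2048 making the context window explicit. A condensed sketch of the same flow; the template below is a stand-in, since the commit's actual lv_qa_prompt is defined elsewhere in app.py:

from langchain.prompts import PromptTemplate

# Stand-in template; the real lv_qa_prompt lives outside this hunk
lv_qa_prompt = PromptTemplate(
    template="Use the context to answer.\nContext: {context}\nQuestion: {question}\nAnswer:",
    input_variables=["context", "question"],
)

def fn_manual_qa(lv_vector_store, lv_model, mv_user_question):
    # Top-2 nearest chunks, concatenated in rank order ("stuff"-style context)
    lv_results = lv_vector_store.similarity_search(mv_user_question, k=2)
    lv_context = "".join(lv_result.page_content for lv_result in lv_results)

    # Format once and call the model directly; no chain object involved
    lv_prompt = lv_qa_prompt.format(question=mv_user_question, context=lv_context)
    return lv_model(lv_prompt)

One trade-off worth noting: unlike RetrievalQA with return_source_documents=True, this flow no longer surfaces the source documents, and the concatenated context plus prompt must fit inside n_ctx together with the generated tokens.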
 