Paul-Joshi committed on
Commit
cef4abb
·
verified ·
1 Parent(s): 4b928c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -30
app.py CHANGED
@@ -14,53 +14,30 @@ from langchain_core.prompts import ChatPromptTemplate
14
  from langchain_community.embeddings import HuggingFaceEmbeddings
15
  from langchain import hub
16
 
 
17
  def method_get_website_text(urls):
18
- # Convert string of URLs to list
19
-
20
  urls_list = urls.split("\n")
21
  docs = [WebBaseLoader(url).load() for url in urls_list]
22
  docs_list = [item for sublist in docs for item in sublist]
23
  return docs_list
24
 
25
-
26
  def method_get_text_chunks(text):
27
- #split the text into chunks
28
-
29
  #text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
30
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
31
  doc_splits = text_splitter.split_documents(text)
32
  return doc_splits
33
 
 
34
  def method_get_vectorstore(document_chunks):
35
- #convert text chunks into embeddings and store in vector database
36
-
37
  # create the open-source embedding function
38
- #embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
39
- embeddings = HuggingFaceEmbeddings()
40
 
41
  # create a vectorstore from the chunks
42
  vector_store = Chroma.from_documents(document_chunks, embeddings)
43
  return vector_store
44
 
45
- # def get_context_retriever_chain(vector_store,question):
46
- # # Initialize the retriever
47
- # retriever = vector_store.as_retriever()
48
-
49
- # # Define the RAG template and RAG prompt template
50
- # prompt = hub.pull("rlm/rag-prompt")
51
-
52
- # # Initialize the Hugging Face language model (LLM)
53
- # llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":512})
54
-
55
- # # Construct the RAG pipeline
56
- # rag_chain = (
57
- # {"context": retriever, "question": RunnablePassthrough()}
58
- # | prompt
59
- # | llm
60
- # | StrOutputParser()
61
- # )
62
- # return rag_chain.invoke(str(question))
63
-
64
 
65
  def get_context_retriever_chain(vector_store,question):
66
  # Initialize the retriever
@@ -76,7 +53,7 @@ def get_context_retriever_chain(vector_store,question):
76
  after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
77
 
78
  # Initialize the Hugging Face language model (LLM)
79
- llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":512})
80
 
81
  # Construct the RAG pipeline
82
  after_rag_chain = (
 
14
  from langchain_community.embeddings import HuggingFaceEmbeddings
15
  from langchain import hub
16
 
17
+ # Convert string of URLs to list
18
  def method_get_website_text(urls):
 
 
19
  urls_list = urls.split("\n")
20
  docs = [WebBaseLoader(url).load() for url in urls_list]
21
  docs_list = [item for sublist in docs for item in sublist]
22
  return docs_list
23
 
24
+ #split the text into chunks
25
  def method_get_text_chunks(text):
 
 
26
  #text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
27
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
28
  doc_splits = text_splitter.split_documents(text)
29
  return doc_splits
30
 
31
+ #convert text chunks into embeddings and store in vector database
32
  def method_get_vectorstore(document_chunks):
 
 
33
  # create the open-source embedding function
34
+ embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
35
+ #embeddings = HuggingFaceEmbeddings()
36
 
37
  # create a vectorstore from the chunks
38
  vector_store = Chroma.from_documents(document_chunks, embeddings)
39
  return vector_store
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def get_context_retriever_chain(vector_store,question):
43
  # Initialize the retriever
 
53
  after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
54
 
55
  # Initialize the Hugging Face language model (LLM)
56
+ llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":1024})
57
 
58
  # Construct the RAG pipeline
59
  after_rag_chain = (