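"""Streamlit app: chat with websites through a simple RAG pipeline.

Pages are loaded with WebBaseLoader, split into chunks, embedded into a
Chroma vector store with nomic-embed-text, and questions are answered by
Mistral-7B-Instruct via the Hugging Face Hub."""
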
import streamlit as st
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_community.llms import HuggingFaceHub
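# NOTE: HuggingFaceHub calls the hosted Inference API and needs the
# HUGGINGFACEHUB_API_TOKEN environment variable to be set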

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain_community.embeddings import HuggingFaceEmbeddings

# Load each newline-separated URL and flatten into one list of Documents
def method_get_website_text(urls):
    urls_list = urls.split("\n")
    docs = [WebBaseLoader(url).load() for url in urls_list]
    docs_list = [item for sublist in docs for item in sublist]
    return docs_list

# Split the loaded documents into overlapping chunks
def method_get_text_chunks(text):
    #text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=7500, chunk_overlap=100)
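    # chunk_size counts characters here (the commented tiktoken variant
    # above counts tokens), so each chunk is roughly 7,500 characters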
    doc_splits = text_splitter.split_documents(text)
    return doc_splits

# Embed the chunks and store them in a Chroma vector database
def method_get_vectorstore(document_chunks):
    # Create the open-source embedding function. Chroma.from_documents
    # expects a LangChain Embeddings object, not pre-computed vectors,
    # so wrap the model instead of calling SentenceTransformer.encode()
    embeddings = HuggingFaceEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1.5",
        model_kwargs={"trust_remote_code": True},
    )
    
    # create a vectorstore from the chunks
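    # Without a persist_directory argument the index is kept in memory
    # only, so it is rebuilt from scratch on every query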
    vector_store = Chroma.from_documents(document_chunks, embeddings)
    return vector_store

    
# Build the RAG chain over the vector store and answer the question
def get_context_retriever_chain(vector_store, question):
    # Initialize the retriever
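    # as_retriever() defaults to similarity search over the top 4 chunks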
    retriever = vector_store.as_retriever()
    
    # Define the RAG template
    after_rag_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    
    # Create the RAG prompt template
    after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
    
    # Initialize the Hugging Face language model (LLM)
    llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":1024})
    
    # Construct the RAG pipeline
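    # The dict fans the input out: the retriever receives the question and
    # returns matching chunks for {context}, while RunnablePassthrough()
    # forwards the raw question into {question}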
    after_rag_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | after_rag_prompt
        | llm
        | StrOutputParser()
    )
    
    return after_rag_chain.invoke(question)

def main():
    st.set_page_config(page_title="Chat with websites", page_icon="🤖")
    st.title("Chat with websites")
    
    # sidebar
    with st.sidebar:
        st.header("Settings")
        website_url = st.text_input("Website URL")
    
    if not website_url:
        st.info("Please enter a website URL")
    
    else:
        # Input fields
        question = st.text_input("Question")
        
        # Button to process input
        if st.button('Query Documents'):
            with st.spinner('Processing...'):
                # get the website text
                raw_text = method_get_website_text(website_url)
                # get the text chunks
                doc_splits = method_get_text_chunks(raw_text)
                # create vector store
                vector_store = method_get_vectorstore(doc_splits)
                # Generate a response using the RAG pipeline
                answer = get_context_retriever_chain(vector_store, question)
                # Display the generated answer: the model output may echo
                # the prompt, so keep only the text after the question line
                split_string = "Question: " + str(question)
                result = answer.split(split_string)[-1]
                st.text_area("Answer", value=result, height=300, disabled=True)

if __name__ == '__main__':
    main()
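
# To run this app (a sketch; package list assumed, not from the source):
#   pip install streamlit langchain langchain-community chromadb \
#       sentence-transformers einops
#   export HUGGINGFACEHUB_API_TOKEN=<your-token>
#   streamlit run app.py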