add website option
- app.py +47 -23
- requirements.txt +1 -0
app.py
CHANGED
@@ -5,7 +5,7 @@ import tempfile
 from langchain_openai import OpenAIEmbeddings
 from langchain_openai.chat_models import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
 from langchain_community.document_loaders.generic import GenericLoader
 from langchain_community.document_loaders.parsers import OpenAIWhisperParser
 from langchain_community.document_loaders.blob_loaders.youtube_audio import (
@@ -13,7 +13,6 @@ from langchain_community.document_loaders.blob_loaders.youtube_audio import (
 )
 from langchain_community.vectorstores import Chroma
 from langchain_core.messages import HumanMessage, AIMessage
-from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain.chains import create_history_aware_retriever, create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
@@ -25,13 +24,20 @@ st.set_page_config(page_title="Chat with your data", page_icon="🤖")
 st.title("Chat with your data")
 st.header("Add your data for RAG")

-data_type = st.radio("Choose the type of data to add:", ("Text", "PDF", "YouTube"))
+data_type = st.radio(
+    "Choose the type of data to add:", ("Text", "PDF", "Website", "YouTube")
+)
+
+if data_type == "YouTube":
+    st.warning(
+        "Note: Processing YouTube videos can be quite costly for me in terms of money. Please use this option sparingly. Thank you for your understanding!"
+    )

 if "vectordb" not in st.session_state:
     st.session_state.vectordb = None


-def add_text_to_chroma(text):
+def get_vectordb_from_text(text):
     embeddings = OpenAIEmbeddings()
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
     texts = text_splitter.split_text(text)
@@ -42,7 +48,7 @@ def add_text_to_chroma(text):
     return vectordb


-def add_pdf_to_chroma(uploaded_pdf):
+def get_vectordb_from_pdf(uploaded_pdf):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
         tmp_file.write(uploaded_pdf.read())
         tmp_file_path = tmp_file.name
@@ -58,7 +64,19 @@ def add_pdf_to_chroma(uploaded_pdf):
     return vectordb


-def add_youtube_to_chroma(youtube_url):
+def get_vectordb_from_website(website_url):
+    loader = WebBaseLoader(website_url)
+    pages = loader.load()
+    embeddings = OpenAIEmbeddings()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    docs = text_splitter.split_documents(pages)
+    vectordb = Chroma.from_documents(
+        documents=docs,
+        embedding=embeddings,
+    )
+
+
+def get_vectordb_from_youtube(youtube_url):
     save_dir = "docs/youtube"
     loader = GenericLoader(
         YoutubeAudioLoader([youtube_url], save_dir), OpenAIWhisperParser()
@@ -76,21 +94,23 @@ def add_youtube_to_chroma(youtube_url):
 if data_type == "Text":
     user_text = st.text_area("Enter text data")
     if st.button("Add"):
-        st.session_state.vectordb = add_text_to_chroma(user_text)
+        st.session_state.vectordb = get_vectordb_from_text(user_text)

 elif data_type == "PDF":
     uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
     if st.button("Add"):
-        st.session_state.vectordb = add_pdf_to_chroma(uploaded_pdf)
+        st.session_state.vectordb = get_vectordb_from_pdf(uploaded_pdf)

+elif data_type == "Website":
+    website_url = st.text_input("Enter website URL")
+    if st.button("Add"):
+        st.session_state.vectordb = get_vectordb_from_website(website_url)
 else:
     youtube_url = st.text_input("Enter YouTube URL")
     if st.button("Add"):
-        st.session_state.vectordb = add_youtube_to_chroma(youtube_url)
+        st.session_state.vectordb = get_vectordb_from_youtube(youtube_url)

-llm = ChatOpenAI(
-    api_key=openai_api_key, temperature=0.2, model="gpt-3.5-turbo"
-)
+llm = ChatOpenAI(api_key=openai_api_key, temperature=0.2, model="gpt-3.5-turbo")


 def get_context_retreiver_chain(vectordb):
@@ -113,11 +133,16 @@ def get_context_retreiver_chain(vectordb):


 def get_conversational_rag_chain(retriever_chain):
-    prompt = ChatPromptTemplate.from_messages([
-        ("system", "Answer the user's questions based on the below context:\n\n{context}"),
-        MessagesPlaceholder(variable_name="chat_history"),
-        ("user", "{input}"),
-    ])
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                "Answer the user's questions based on the below context:\n\n{context}",
+            ),
+            MessagesPlaceholder(variable_name="chat_history"),
+            ("user", "{input}"),
+        ]
+    )

     stuff_domain_chain = create_stuff_documents_chain(llm, prompt)

@@ -127,16 +152,15 @@ def get_conversational_rag_chain(retriever_chain):
 def get_response(user_input):
     if st.session_state.vectordb is None:
         return "Please add data first"
-
+
     retrieveal_chain = get_context_retreiver_chain(st.session_state.vectordb)
     converasational_rag_chain = get_conversational_rag_chain(retrieveal_chain)

-    response = converasational_rag_chain.invoke({
-        "chat_history": st.session_state.chat_history,
-        "input": user_input,
-    })
+    response = converasational_rag_chain.invoke(
+        {"chat_history": st.session_state.chat_history, "input": user_input}
+    )

-    return response["answer"]
+    return response["answer"]


 user_query = st.chat_input("Your message")
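For reference, the new website ingestion path can be exercised on its own with the sketch below, assuming OPENAI_API_KEY is set in the environment and using a placeholder URL. Note that get_vectordb_from_website in the diff above builds the Chroma store but does not appear to return it, so the sketch assigns and queries the store directly instead of going through the helper.

# Standalone sketch of the website ingestion path; the URL is a placeholder
# and OPENAI_API_KEY is assumed to be set in the environment.
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch the page and parse its HTML into Document objects.
pages = WebBaseLoader("https://example.com").load()

# Split into overlapping chunks, mirroring the settings used in app.py.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = splitter.split_documents(pages)

# Embed the chunks and index them in Chroma.
vectordb = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())

# Sanity check: pull the chunks most similar to a question, as the RAG chain would.
for doc in vectordb.similarity_search("What is this page about?", k=2):
    print(doc.page_content[:200])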
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ langchain_community
 langchain_openai
 langchain_pinecone
 pypdf
+beautifulsoup4
 yt_dlp
 pydub
 chromadb
|