hail75 committed
Commit e3ada61 · 1 Parent(s): 8c922bb

add website option

Files changed (2)
  1. app.py +47 -23
  2. requirements.txt +1 -0
app.py CHANGED
@@ -5,7 +5,7 @@ import tempfile
 from langchain_openai import OpenAIEmbeddings
 from langchain_openai.chat_models import ChatOpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
 from langchain_community.document_loaders.generic import GenericLoader
 from langchain_community.document_loaders.parsers import OpenAIWhisperParser
 from langchain_community.document_loaders.blob_loaders.youtube_audio import (
@@ -13,7 +13,6 @@ from langchain_community.document_loaders.blob_loaders.youtube_audio import (
 )
 from langchain_community.vectorstores import Chroma
 from langchain_core.messages import HumanMessage, AIMessage
-from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain.chains import create_history_aware_retriever, create_retrieval_chain
 from langchain.chains.combine_documents import create_stuff_documents_chain
@@ -25,13 +24,20 @@ st.set_page_config(page_title="Chat with your data", page_icon="🤖")
 st.title("Chat with your data")
 st.header("Add your data for RAG")
 
-data_type = st.radio("Choose the type of data to add:", ("Text", "PDF", "YouTube URL"))
+data_type = st.radio(
+    "Choose the type of data to add:", ("Text", "PDF", "Website", "YouTube")
+)
+
+if data_type == "YouTube":
+    st.warning(
+        "Note: Processing YouTube videos can be quite costly for me in terms of money. Please use this option sparingly. Thank you for your understanding!"
+    )
 
 if "vectordb" not in st.session_state:
     st.session_state.vectordb = None
 
 
-def add_text_to_chroma(text):
+def get_vectordb_from_text(text):
     embeddings = OpenAIEmbeddings()
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
     texts = text_splitter.split_text(text)
@@ -42,7 +48,7 @@ def add_text_to_chroma(text):
     return vectordb
 
 
-def add_pdf_to_chroma(uploaded_pdf):
+def get_vectordb_from_pdf(uploaded_pdf):
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
         tmp_file.write(uploaded_pdf.read())
         tmp_file_path = tmp_file.name
@@ -58,7 +64,19 @@ def add_pdf_to_chroma(uploaded_pdf):
     return vectordb
 
 
-def add_youtube_to_chroma(youtube_url):
+def get_vectordb_from_website(website_url):
+    loader = WebBaseLoader(website_url)
+    pages = loader.load()
+    embeddings = OpenAIEmbeddings()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    docs = text_splitter.split_documents(pages)
+    vectordb = Chroma.from_documents(
+        documents=docs,
+        embedding=embeddings,
+    )
+
+
+def get_vectordb_from_youtube(youtube_url):
     save_dir = "docs/youtube"
     loader = GenericLoader(
         YoutubeAudioLoader([youtube_url], save_dir), OpenAIWhisperParser()
@@ -76,21 +94,23 @@ def add_youtube_to_chroma(youtube_url):
 if data_type == "Text":
     user_text = st.text_area("Enter text data")
     if st.button("Add"):
-        st.session_state.vectordb = add_text_to_chroma(user_text)
+        st.session_state.vectordb = get_vectordb_from_text(user_text)
 
 elif data_type == "PDF":
     uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
     if st.button("Add"):
-        st.session_state.vectordb = add_pdf_to_chroma(uploaded_pdf)
+        st.session_state.vectordb = get_vectordb_from_pdf(uploaded_pdf)
 
+elif data_type == "Website":
+    website_url = st.text_input("Enter website URL")
+    if st.button("Add"):
+        st.session_state.vectordb = get_vectordb_from_website(website_url)
 else:
     youtube_url = st.text_input("Enter YouTube URL")
     if st.button("Add"):
-        st.session_state.vectordb = add_youtube_to_chroma(youtube_url)
+        st.session_state.vectordb = get_vectordb_from_youtube(youtube_url)
 
-llm = ChatOpenAI(
-    api_key=openai_api_key, temperature=0.2, model="gpt-3.5-turbo"
-)
+llm = ChatOpenAI(api_key=openai_api_key, temperature=0.2, model="gpt-3.5-turbo")
 
 
 def get_context_retreiver_chain(vectordb):
@@ -113,11 +133,16 @@ def get_context_retreiver_chain(vectordb):
 
 
 def get_conversational_rag_chain(retriever_chain):
-    prompt = ChatPromptTemplate.from_messages([
-        ("system", "Answer the user's questions based on the below context:\n\n{context}"),
-        MessagesPlaceholder(variable_name="chat_history"),
-        ("user", "{input}"),
-    ])
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            (
+                "system",
+                "Answer the user's questions based on the below context:\n\n{context}",
+            ),
+            MessagesPlaceholder(variable_name="chat_history"),
+            ("user", "{input}"),
+        ]
+    )
 
     stuff_domain_chain = create_stuff_documents_chain(llm, prompt)
 
@@ -127,16 +152,15 @@ def get_conversational_rag_chain(retriever_chain):
 def get_response(user_input):
     if st.session_state.vectordb is None:
         return "Please add data first"
-
+
     retrieveal_chain = get_context_retreiver_chain(st.session_state.vectordb)
     converasational_rag_chain = get_conversational_rag_chain(retrieveal_chain)
 
-    response = converasational_rag_chain.invoke({
-        "chat_history": st.session_state.chat_history,
-        "input": user_input
-    })
+    response = converasational_rag_chain.invoke(
+        {"chat_history": st.session_state.chat_history, "input": user_input}
+    )
 
-    return response['answer']
+    return response["answer"]
 
 
 user_query = st.chat_input("Your message")
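For reference, a minimal standalone sketch of the website ingestion path introduced in this commit, assuming `OPENAI_API_KEY` is set in the environment. Unlike the committed `get_vectordb_from_website`, this sketch returns the Chroma store explicitly so it can be used outside the Streamlit session state; the URL and the similarity-search query are hypothetical.

```python
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma


def get_vectordb_from_website(website_url):
    # Fetch the page and turn it into LangChain Document objects
    pages = WebBaseLoader(website_url).load()
    # Chunk the page text; same settings as the text and PDF paths in app.py
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = splitter.split_documents(pages)
    # Embed the chunks and build an in-memory Chroma store
    vectordb = Chroma.from_documents(documents=docs, embedding=OpenAIEmbeddings())
    return vectordb  # not returned in the committed helper; added here for standalone use


if __name__ == "__main__":
    db = get_vectordb_from_website("https://example.com")  # hypothetical URL
    print(db.similarity_search("What is this page about?", k=2))
```

Chunking at 1000 characters with 150 overlap mirrors the settings already used for the text and PDF loaders.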
requirements.txt CHANGED
@@ -3,6 +3,7 @@ langchain_community
 langchain_openai
 langchain_pinecone
 pypdf
+beautifulsoup4
 yt_dlp
 pydub
 chromadb
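`beautifulsoup4` is added because `WebBaseLoader` parses the fetched HTML with BeautifulSoup. A quick sanity check that the dependency is available to the loader (the URL below is hypothetical):

```python
# Verifies that beautifulsoup4 (imported as bs4) is installed and that
# WebBaseLoader can fetch and parse a page with it.
import bs4  # noqa: F401 -- provided by the beautifulsoup4 package
from langchain_community.document_loaders import WebBaseLoader

docs = WebBaseLoader("https://example.com").load()  # hypothetical URL
print(len(docs), "document(s) loaded,", len(docs[0].page_content), "characters")
```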