hail75 committed on
Commit
7f07a51
·
1 Parent(s): a007d8f
.vscode/launch.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "0.2.0",
3
+ "configurations": [
4
+ {
5
+ "name": "Python:Streamlit",
6
+ "type": "debugpy",
7
+ "request": "launch",
8
+ "module": "streamlit",
9
+ "args": [
10
+ "run",
11
+ "${file}",
12
+ "--server.port",
13
+ "2000"
14
+ ]
15
+ }
16
+ ]
17
+ }
app.py CHANGED
@@ -7,11 +7,18 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_community.document_loaders import PyPDFLoader
8
  from langchain_community.document_loaders.generic import GenericLoader
9
  from langchain_community.document_loaders.parsers import OpenAIWhisperParser
10
- from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
 
 
11
  from langchain_community.vectorstores import Chroma
12
- from langchain_core.prompts import ChatPromptTemplate
13
- from langchain.memory import ConversationBufferMemory
14
- from langchain.chains import RetrievalQA
 
 
 
 
 
15
 
16
  st.set_page_config(page_title="Chat with your data", page_icon="🤖")
17
  st.title("Chat with your data")
@@ -19,79 +26,135 @@ st.header("Add your data for RAG")
19
 
20
  data_type = st.radio("Choose the type of data to add:", ("Text", "PDF", "YouTube URL"))
21
 
22
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
23
- pages = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  if data_type == "Text":
26
  user_text = st.text_area("Enter text data")
27
  if st.button("Add"):
28
- pages = user_text
29
 
30
  elif data_type == "PDF":
31
  uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
32
  if st.button("Add"):
33
- loader = PyPDFLoader("docs/ttdn.pdf")
34
- pages = loader.load()
35
 
36
- elif data_type == "YouTube URL":
37
  youtube_url = st.text_input("Enter YouTube URL")
38
  if st.button("Add"):
39
- save_dir="docs/youtube"
40
- loader = GenericLoader(
41
- YoutubeAudioLoader([youtube_url], save_dir),
42
- OpenAIWhisperParser()
43
- )
44
-
45
- pages = loader.load()
46
 
47
  llm = ChatOpenAI(
48
- api_key=os.environ.get("OPENAI_API_KEY"),
49
- temperature=0.2,
50
- model='gpt-3.5-turbo')
51
-
52
-
53
- template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
54
- Context: {context}
55
- Question: {question}
56
- Helpful Answer:"""
57
-
58
- prompt = ChatPromptTemplate.from_template(template)
59
-
60
- if pages:
61
- embedding = OpenAIEmbeddings()
62
- if data_type == "Text":
63
- texts = text_splitter.split_text(pages)
64
- vectordb = Chroma.from_texts(
65
- texts=texts,
66
- embedding=embedding,
67
- persist_directory='docs/chroma/'
68
- )
69
- else:
70
- docs = text_splitter.split_documents(pages)
71
-
72
- vectordb = Chroma.from_documents(
73
- documents=docs,
74
- embedding=embedding,
75
- persist_directory='docs/chroma/'
76
- )
77
-
78
- qa_chain = RetrievalQA.from_chain_type(
79
- llm,
80
- retriever=vectordb.as_retriever(),
81
- return_source_documents=True,
82
- chain_type_kwargs={"prompt": prompt}
83
  )
84
 
85
- result = qa_chain.invoke({"query": "What is BSM Labs"})
86
- st.write(result["result"])
87
- # st.session_state.retriever = vectordb.as_retriever()
88
-
89
- # if "retriever" in st.session_state:
90
- # user_query = st.chat_input("Ask a question")
91
- # if user_query:
92
- # chain = prompt | llm | parser
93
- # response = chain.invoke(input={
94
- # "context": st.session_state.retriever,
95
- # "question": user_query
96
- # })
97
- # st.write(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from langchain_community.document_loaders import PyPDFLoader
8
  from langchain_community.document_loaders.generic import GenericLoader
9
  from langchain_community.document_loaders.parsers import OpenAIWhisperParser
10
+ from langchain_community.document_loaders.blob_loaders.youtube_audio import (
11
+ YoutubeAudioLoader,
12
+ )
13
  from langchain_community.vectorstores import Chroma
14
+ from langchain_core.messages import HumanMessage, AIMessage
15
+ from langchain_core.output_parsers import StrOutputParser
16
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
17
+ from langchain.chains import create_history_aware_retriever, create_retrieval_chain
18
+ from langchain.chains.combine_documents import create_stuff_documents_chain
19
+
20
+
21
+ openai_api_key = os.getenv("OPENAI_API_KEY")
22
 
23
  st.set_page_config(page_title="Chat with your data", page_icon="🤖")
24
  st.title("Chat with your data")
 
26
 
27
  data_type = st.radio("Choose the type of data to add:", ("Text", "PDF", "YouTube URL"))
28
 
29
+ if "vectordb" not in st.session_state:
30
+ st.session_state.vectordb = None
31
+
32
+
33
def add_text_to_chroma(text):
    """Build a Chroma vector store from raw text.

    The text is split with ``RecursiveCharacterTextSplitter``
    (``chunk_size=1000``, ``chunk_overlap=150``), embedded with
    ``OpenAIEmbeddings`` and stored in a new Chroma store. No
    ``persist_directory`` is passed to Chroma here.

    Args:
        text: The text to index.

    Returns:
        The Chroma vector store holding the embedded chunks.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
    """
    # Blank input would produce no chunks; fail early with a clear message
    # instead of handing an empty list to the embedding/vector-store layer.
    if not text or not text.strip():
        raise ValueError("Cannot add empty text to the vector store")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    texts = text_splitter.split_text(text)
    return Chroma.from_texts(
        texts=texts,
        embedding=OpenAIEmbeddings(),
    )
42
+
43
+
44
def add_pdf_to_chroma(uploaded_pdf):
    """Build a Chroma vector store from a PDF.

    Args:
        uploaded_pdf: Either a filesystem path to a PDF, or an uploaded
            file object exposing ``getvalue()`` (the caller in this file
            passes the result of ``st.file_uploader``).

    Returns:
        The Chroma vector store holding the embedded PDF chunks.

    Raises:
        ValueError: If ``uploaded_pdf`` is ``None`` (nothing was uploaded).
    """
    import os
    import tempfile

    if uploaded_pdf is None:
        raise ValueError("No PDF file was uploaded")

    if isinstance(uploaded_pdf, (str, os.PathLike)):
        # Already a path on disk: load it directly.
        pages = PyPDFLoader(uploaded_pdf).load()
    else:
        # PyPDFLoader reads from a file path, while an upload only exists in
        # memory, so spool the bytes to a temporary file first.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file.write(uploaded_pdf.getvalue())
            tmp_path = tmp_file.name
        try:
            pages = PyPDFLoader(tmp_path).load()
        finally:
            # Always remove the temporary copy, even if parsing fails.
            os.remove(tmp_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(pages)
    return Chroma.from_documents(
        documents=docs,
        embedding=OpenAIEmbeddings(),
    )
55
+
56
+
57
def add_youtube_to_chroma(youtube_url):
    """Build a Chroma vector store from the audio of a YouTube video.

    The audio is fetched into ``docs/youtube`` by ``YoutubeAudioLoader``,
    parsed by ``OpenAIWhisperParser``, split into chunks
    (``chunk_size=1000``, ``chunk_overlap=150``) and embedded with
    ``OpenAIEmbeddings``.

    Args:
        youtube_url: URL of the video to index.

    Returns:
        The Chroma vector store holding the embedded transcript chunks.

    Raises:
        ValueError: If ``youtube_url`` is empty or contains only whitespace.
    """
    # Fail early with a clear message instead of starting a download for a
    # URL that cannot be valid.
    if not youtube_url or not youtube_url.strip():
        raise ValueError("No YouTube URL was provided")

    save_dir = "docs/youtube"
    loader = GenericLoader(
        YoutubeAudioLoader([youtube_url], save_dir), OpenAIWhisperParser()
    )
    pages = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(pages)
    # NOTE(review): unlike add_text_to_chroma / add_pdf_to_chroma, this store
    # is given persist_directory="chroma" -- confirm whether on-disk
    # persistence is intended only for the YouTube source.
    return Chroma.from_documents(
        documents=docs, embedding=OpenAIEmbeddings(), persist_directory="chroma"
    )
70
+
71
 
72
# Render the input widget for the selected data type. When "Add" is clicked,
# index the input and keep the resulting store in the session state so it is
# still available on later reruns of the script.
if data_type == "Text":
    user_text = st.text_area("Enter text data")
    if st.button("Add"):
        if user_text and user_text.strip():
            st.session_state.vectordb = add_text_to_chroma(user_text)
        else:
            # Nothing to index; tell the user instead of embedding empty text.
            st.warning("Please enter some text first")

elif data_type == "PDF":
    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if st.button("Add"):
        if uploaded_pdf is not None:
            st.session_state.vectordb = add_pdf_to_chroma(uploaded_pdf)
        else:
            # The uploader yields None until a file has been chosen.
            st.warning("Please upload a PDF first")

else:
    youtube_url = st.text_input("Enter YouTube URL")
    if st.button("Add"):
        if youtube_url and youtube_url.strip():
            st.session_state.vectordb = add_youtube_to_chroma(youtube_url)
        else:
            st.warning("Please enter a YouTube URL first")
 
 
 
 
 
 
86
 
87
# Chat model used by the chain-building functions below; the API key comes
# from the OPENAI_API_KEY environment variable read at module import.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.2,
    api_key=openai_api_key,
)
90
+
91
+
92
def get_context_retreiver_chain(vectordb):
    """Create a history-aware retriever on top of ``vectordb``.

    The module-level ``llm`` is prompted with the chat history and the
    latest user input and asked to produce a search query; that prompt, the
    model and the store's retriever are combined by
    ``create_history_aware_retriever``.

    Args:
        vectordb: Vector store exposing ``as_retriever()``.

    Returns:
        The chain returned by ``create_history_aware_retriever``.
    """
    base_retriever = vectordb.as_retriever()

    query_messages = [
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        (
            "user",
            "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation",
        ),
    ]
    query_prompt = ChatPromptTemplate.from_messages(query_messages)

    return create_history_aware_retriever(llm, base_retriever, query_prompt)
109
+
110
+
111
def get_conversational_rag_chain(retriever_chain):
    """Combine a retriever chain with an answer-generating chain.

    The answer prompt carries the retrieved ``{context}`` in a system
    message, followed by the chat history and the latest user input.

    Args:
        retriever_chain: Chain that supplies the documents to answer from.

    Returns:
        The chain returned by ``create_retrieval_chain``.
    """
    answer_messages = [
        ("system", "Answer the user's questions based on the below context:\n\n{context}"),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
    ]
    answer_prompt = ChatPromptTemplate.from_messages(answer_messages)

    # Chain that puts the retrieved documents into the prompt's {context}.
    document_chain = create_stuff_documents_chain(llm, answer_prompt)

    return create_retrieval_chain(retriever_chain, document_chain)
121
+
122
+
123
def get_response(user_input):
    """Answer ``user_input`` from the indexed data and the chat history.

    Args:
        user_input: The user's latest message.

    Returns:
        The answer text as a string, or the literal
        ``"Please add data first"`` when no vector store exists yet.
    """
    if st.session_state.vectordb is None:
        return "Please add data first"

    retriever_chain = get_context_retreiver_chain(st.session_state.vectordb)
    conversational_rag_chain = get_conversational_rag_chain(retriever_chain)

    response = conversational_rag_chain.invoke(
        {
            "chat_history": st.session_state.chat_history,
            "input": user_input,
        }
    )

    # The retrieval chain returns a dict, not text. The caller renders the
    # result with st.markdown and wraps it in AIMessage, so hand back only
    # the generated answer string.
    return response["answer"]
136
+
137
+
138
user_query = st.chat_input("Your message")

# Create the conversation history the first time the script runs in a session.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Replay the stored conversation; anything that is not a HumanMessage is
# shown as coming from the AI.
for message in st.session_state.chat_history:
    role = "Human" if isinstance(message, HumanMessage) else "AI"
    with st.chat_message(role):
        st.markdown(message.content)

# A falsy query (None or empty string) means there is nothing to answer.
if user_query:
    with st.chat_message("Human"):
        st.markdown(user_query)

    with st.chat_message("AI"):
        ai_response = get_response(user_query)
        st.markdown(ai_response)

    # Record both turns so they are replayed on the next rerun.
    st.session_state.chat_history.append(HumanMessage(user_query))
    st.session_state.chat_history.append(AIMessage(ai_response))
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/data_level0.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f18abd8c514282db82706e52b0a33ed659cd534e925a6f149deb7af9ce34bd8e
3
- size 6284000
 
 
 
 
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:effaa959ce2b30070fdafc2fe82096fc46e4ee7561b75920dd3ce43d09679b21
3
- size 100
 
 
 
 
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/length.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
3
- size 4000
 
 
 
 
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/link_lists.bin DELETED
File without changes
docs/chroma/chroma.sqlite3 DELETED
Binary file (479 kB)
 
docs/ttdn.pdf DELETED
Binary file (147 kB)
 
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  langchain
2
  langchain_community
3
  langchain_openai
 
4
  pypdf
5
  yt_dlp
6
  pydub
 
1
  langchain
2
  langchain_community
3
  langchain_openai
4
+ langchain_pinecone
5
  pypdf
6
  yt_dlp
7
  pydub