add rag
Browse files- app.py +87 -23
- docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/data_level0.bin +3 -0
- docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/header.bin +3 -0
- docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/length.bin +3 -0
- docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/link_lists.bin +0 -0
- docs/chroma/chroma.sqlite3 +0 -0
- docs/ttdn.pdf +0 -0
- requirements.txt +6 -1
app.py
CHANGED
@@ -1,33 +1,97 @@
|
|
1 |
import os
|
2 |
-
|
3 |
import streamlit as st
|
4 |
|
5 |
-
from langchain_openai import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from langchain_core.prompts import ChatPromptTemplate
|
|
|
|
|
7 |
|
8 |
-
|
9 |
st.title("Chat with your data")
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
|
|
|
|
|
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
api_key=os.environ.get("OPENAI_API_KEY"),
|
15 |
temperature=0.2,
|
16 |
-
model='gpt-3.5-turbo
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Streamlit RAG demo: ingest Text / PDF / YouTube audio, index it in Chroma,
and answer a query with a RetrievalQA chain over the indexed chunks."""
import os

import streamlit as st
from langchain_openai import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import OpenAIWhisperParser
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA

st.set_page_config(page_title="Chat with your data", page_icon="🤖")
st.title("Chat with your data")
st.header("Add your data for RAG")

data_type = st.radio("Choose the type of data to add:", ("Text", "PDF", "YouTube URL"))

# Shared splitter: ~1000-char chunks with 150-char overlap so retrieved
# chunks keep context across boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# Raw string for "Text", or a list of langchain Documents for PDF / YouTube.
pages = None

if data_type == "Text":
    user_text = st.text_area("Enter text data")
    if st.button("Add"):
        if user_text.strip():
            pages = user_text
        else:
            st.error("Please enter some text first.")

elif data_type == "PDF":
    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if st.button("Add"):
        if uploaded_pdf is None:
            st.error("Please upload a PDF file first.")
        else:
            # BUG FIX: the uploaded file was previously ignored and a
            # hard-coded "docs/ttdn.pdf" was loaded instead. Persist the
            # upload and load THAT file. basename() strips any path
            # components a client could smuggle into the filename.
            os.makedirs("docs", exist_ok=True)
            pdf_path = os.path.join("docs", os.path.basename(uploaded_pdf.name))
            with open(pdf_path, "wb") as f:
                f.write(uploaded_pdf.getbuffer())
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()

elif data_type == "YouTube URL":
    youtube_url = st.text_input("Enter YouTube URL")
    if st.button("Add"):
        if not youtube_url.strip():
            st.error("Please enter a YouTube URL first.")
        else:
            save_dir = "docs/youtube"
            # Download the audio track, then transcribe it with Whisper.
            loader = GenericLoader(
                YoutubeAudioLoader([youtube_url], save_dir),
                OpenAIWhisperParser(),
            )
            pages = loader.load()

# LLM used for answer synthesis; the API key comes from the environment,
# never from the UI.
llm = ChatOpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    temperature=0.2,
    model='gpt-3.5-turbo')

template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible.
Context: {context}
Question: {question}
Helpful Answer:"""

prompt = ChatPromptTemplate.from_template(template)

if pages:
    embedding = OpenAIEmbeddings()
    if data_type == "Text":
        # Raw text: split into plain strings and index with from_texts.
        texts = text_splitter.split_text(pages)
        vectordb = Chroma.from_texts(
            texts=texts,
            embedding=embedding,
            persist_directory='docs/chroma/'
        )
    else:
        # PDF / YouTube produce Document objects: split and index those.
        docs = text_splitter.split_documents(pages)

        vectordb = Chroma.from_documents(
            documents=docs,
            embedding=embedding,
            persist_directory='docs/chroma/'
        )

    # "stuff" chain (default): retrieved chunks are injected into {context}
    # of the prompt above; sources are returned for traceability.
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )

    # NOTE(review): the query is still hard-coded — presumably a smoke test;
    # wire this to st.chat_input to make it a real chat. TODO confirm intent.
    result = qa_chain.invoke({"query": "What is BSM Labs"})
    st.write(result["result"])
|
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f18abd8c514282db82706e52b0a33ed659cd534e925a6f149deb7af9ce34bd8e
|
3 |
+
size 6284000
|
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:effaa959ce2b30070fdafc2fe82096fc46e4ee7561b75920dd3ce43d09679b21
|
3 |
+
size 100
|
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
|
3 |
+
size 4000
|
docs/chroma/a94be33e-75ea-4e61-9699-3f0ab772f12a/link_lists.bin
ADDED
File without changes
|
docs/chroma/chroma.sqlite3
ADDED
Binary file (479 kB). View file
|
|
docs/ttdn.pdf
ADDED
Binary file (147 kB). View file
|
|
requirements.txt
CHANGED
@@ -1,2 +1,7 @@
|
|
1 |
langchain
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
langchain
|
2 |
+
langchain_community
|
3 |
+
langchain_openai
|
4 |
+
pypdf
|
5 |
+
yt_dlp
|
6 |
+
pydub
|
7 |
+
chromadb
|
8 |
+
streamlit
|