sahanes commited on
Commit
34cce64
·
1 Parent(s): 5d83944
Files changed (6) hide show
  1. .gitignore +6 -0
  2. Dockerfile +13 -0
  3. app.py +210 -0
  4. chainlit.md +14 -0
  5. data/finantial_report.pdf +0 -0
  6. requirements.txt +9 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ .chainlit
4
+ *.faiss
5
+ *.pkl
6
+ .files
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.9

# Run as a non-root user (uid 1000, as required by HF Spaces).
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Install dependencies before copying the source so this layer is cached
# across source-only rebuilds.
# NOTE: Docker COPY does not expand `~` — the original `~/app/...` destination
# created a literal directory named `~`. Use $HOME explicitly.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install --upgrade pip && \
    pip install -r requirements.txt

# Copy the application source, owned by the runtime user.
COPY --chown=user . $HOME/app

# Pre-create the vectorstore directory so the app can persist the FAISS index.
RUN mkdir -p $HOME/app/data/vectorstore

CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chainlit as cl
3
+ from dotenv import load_dotenv
4
+ from numpy import arange
5
+ from operator import itemgetter
6
+ from langchain_huggingface import HuggingFaceEndpoint
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+ from langchain import text_splitter
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+ from langchain_community.vectorstores import FAISS
11
+ from langchain_huggingface import HuggingFaceEndpointEmbeddings
12
+ from langchain_core.prompts import PromptTemplate
13
+ from langchain.schema.output_parser import StrOutputParser
14
+ from langchain.schema.runnable import RunnablePassthrough
15
+ from langchain.schema.runnable.config import RunnableConfig
16

# GLOBAL SCOPE - ENTIRE APPLICATION HAS ACCESS TO VALUES SET IN THIS SCOPE #
# ---- ENV VARIABLES ---- #

# Load the environment file (.env) if one is present.
# NOTE: keep .env in .gitignore (it is by default) so secrets never get committed.
load_dotenv()

# Required configuration — fail fast with a KeyError if any of these is unset.
HF_LLM_ENDPOINT = os.environ["HF_LLM_ENDPOINT"]      # text-generation endpoint URL
HF_EMBED_ENDPOINT = os.environ["HF_EMBED_ENDPOINT"]  # embedding endpoint URL
HF_TOKEN = os.environ["HF_TOKEN"]                    # HuggingFace API token

# ---- GLOBAL DECLARATIONS ---- #
# Relative paths so the same code works locally and inside the Docker image.
DATA_DIR = "./data"
VECTORSTORE_DIR = os.path.join(DATA_DIR, "vectorstore")
VECTORSTORE_PATH = os.path.join(VECTORSTORE_DIR, "index.faiss")


# -- RETRIEVAL -- #
# 1. Load the source PDF and split it into pages.
# 2. Drop the front-matter pages and strip boilerplate.
# 3. Partition pages into structured (tables) vs. unstructured (narrative) text.

### 1. CREATE TEXT LOADER AND LOAD DOCUMENTS
# The PDF is fetched separately, e.g.:
# wget --no-check-certificate 'https://drive.google.com/uc?id=1tGmnWoO-wtU_bTs_M1GVXrTB5Su61zLg' -O data/finantial_report.pdf
# Relative path so it resolves both locally and inside the Docker image.
loader = PyPDFLoader("./data/finantial_report.pdf")
pages = loader.load_and_split()

# The first two pages (cover / table of contents) are excluded.
# Each entry pairs the cleaned page text with its source page index.
text_content = [
    (pages[i].page_content.replace('Table of Contents\n', ''), {'page source': i})
    for i in range(2, len(pages))
]

import re

# Heuristic pattern marking pages that contain structured financial tables
# (balance sheets, income statements, cash-flow tables) rather than prose.
structured_pattern = re.compile(
    r"\(in millions(?:, except\b.*)?\)|\b(unaudited)\b|\bBalance Sheet\b|\bIncome Statement\b|\bCash Flows\b|\bfollowing table\b",
    re.IGNORECASE,
)

# Split the text content. The page-source metadata is appended to the text so
# it survives the downstream chunking step.
structured_data = []
unstructured_data = []
for page_text, page_meta in text_content:
    bucket = structured_data if structured_pattern.search(page_text) else unstructured_data
    bucket.append(page_text + str(page_meta))


### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
from langchain.schema import Document

# Wrap each page's text in a Document, tagged with a sequential id.
documents = [
    Document(id=position, page_content=content)
    for position, content in enumerate(structured_data + unstructured_data)
]

# NOTE: this rebinds `text_splitter`, shadowing the `langchain.text_splitter`
# module imported at the top of the file.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
split_chunks = text_splitter.split_documents(documents)


### 3. LOAD HUGGINGFACE EMBEDDINGS
hf_embeddings = HuggingFaceEndpointEmbeddings(
    model=HF_EMBED_ENDPOINT,
    task="feature-extraction",
    huggingfacehub_api_token=os.environ["HF_TOKEN"],
)

## Prevent re-indexing if a vectorstore already exists.
# BUGFIX: test for the index file itself, not the directory. The Dockerfile
# pre-creates the (empty) vectorstore directory, so a directory-existence
# check would attempt to load a store that was never written and crash.
if os.path.exists(VECTORSTORE_PATH):
    vectorstore = FAISS.load_local(
        VECTORSTORE_DIR,
        hf_embeddings,
        allow_dangerous_deserialization=True  # required: the store is persisted as a `.pkl` file.
    )
    hf_retriever = vectorstore.as_retriever()
    print("Loaded Vectorstore")
else:
    print("Indexing Files")
    os.makedirs(VECTORSTORE_DIR, exist_ok=True)
    ### 4. INDEX FILES
    ### NOTE: documents are batched with a maximum batch size of 32
    ### (per the embedding-endpoint note this code was written against).
    vectorstore = None
    for i in range(0, len(split_chunks), 32):
        batch = split_chunks[i:i + 32]
        if vectorstore is None:
            vectorstore = FAISS.from_documents(batch, hf_embeddings)
        else:
            vectorstore.add_documents(batch)
    vectorstore.save_local(VECTORSTORE_DIR)
    hf_retriever = vectorstore.as_retriever()


# -- AUGMENTED -- #
"""
1. Define a String Template
2. Create a Prompt Template from the String Template
"""
### 1. DEFINE STRING TEMPLATE
# Uses the special header/end-of-turn tokens that appear literally below
# (<|start_header_id|>...<|eot_id|>) to delimit system / user / assistant turns.
# The trailing backslashes continue the string without inserting a newline.
RAG_PROMPT_TEMPLATE = """\
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context,\
say you don't know.<|eot_id|>

<|start_header_id|>user<|end_header_id|>
User Query:
{query}

Context:
{context}<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""
# Note: there is no response here. The assistant turn is opened but NOT closed
# with <|eot_id|>, because the model is expected to generate the response.

### 2. CREATE PROMPT TEMPLATE
# Builds a PromptTemplate with input variables {query} and {context}.
rag_prompt =PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

142
+
143
+ # -- GENERATION -- #
144
+ """
145
+ 1. Create a HuggingFaceEndpoint for the LLM
146
+ """
147
+ ### 1. CREATE HUGGINGFACE ENDPOINT FOR LLM
148
+ hf_llm = HuggingFaceEndpoint(
149
+ endpoint_url=f"{HF_LLM_ENDPOINT}",
150
+ max_new_tokens=512,
151
+ top_k=10,
152
+ top_p=0.95,
153
+ typical_p=0.95,
154
+ temperature=0.01,
155
+ repetition_penalty=1.03,
156
+ huggingfacehub_api_token=os.environ["HF_TOKEN"]
157
+ )

@cl.author_rename
def rename(original_author: str) -> str:
    """
    Rename the displayed 'author' of a message.

    In this case, we're overriding the 'Assistant' author to be 'Airbnb 10k Bot'.
    Any other author name is passed through unchanged.
    """
    rename_dict = {
        "Assistant" : "Airbnb 10k Bot"
    }
    return rename_dict.get(original_author, original_author)

170
+
171
+ @cl.on_chat_start
172
+ async def start_chat():
173
+ """
174
+ This function will be called at the start of every user session.
175
+
176
+ We will build our LCEL RAG chain here, and store it in the user session.
177
+
178
+ The user session is a dictionary that is unique to each user session, and is stored in the memory of the server.
179
+ """
180
+
181
+ ### BUILD LCEL RAG CHAIN THAT ONLY RETURNS TEXT
182
+ lcel_rag_chain = ( {"context": itemgetter("query") | hf_retriever, "query": itemgetter("query")}
183
+
184
+ | rag_prompt | hf_llm
185
+ )
186
+
187
+ cl.user_session.set("lcel_rag_chain", lcel_rag_chain)

@cl.on_message
async def main(message: cl.Message):
    """
    This function will be called every time a message is received from a session.

    We will use the LCEL RAG chain to generate a response to the user query.

    The LCEL RAG chain is stored in the user session, and is unique to each user session - this is why we can access it here.
    """
    lcel_rag_chain = cl.user_session.get("lcel_rag_chain")

    # Start with an empty message and stream tokens into it as they arrive.
    msg = cl.Message(content="")

    async for chunk in lcel_rag_chain.astream(
        {"query": message.content},
        # Wire LangChain callbacks into Chainlit so intermediate steps show in the UI.
        config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
    ):
        await msg.stream_token(chunk)

    await msg.send()

# docker build -t airbnb-llm-chainrag-chainlit-hfs .
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Welcome to Chainlit! πŸš€πŸ€–
2
+
3
+ Hi there, Developer! πŸ‘‹ We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs.
4
+
5
+ ## Useful Links πŸ”—
6
+
7
+ - **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) πŸ“š
8
+ - **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! πŸ’¬
9
+
10
+ We can't wait to see what you create with Chainlit! Happy coding! πŸ’»πŸ˜Š
11
+
12
+ ## Welcome screen
13
+
14
+ To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty.
data/finantial_report.pdf ADDED
Binary file (596 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ chainlit==0.7.700
2
+ langchain==0.2.5
3
+ langchain_community==0.2.5
4
+ langchain_core==0.2.9
5
+ langchain_huggingface==0.0.3
6
+ langchain_text_splitters==0.2.1
7
+ python-dotenv==1.0.1
8
+ faiss-cpu
9
+ pypdf