robertselvam committed on
Commit
0ba90e2
·
1 Parent(s): 7934b86

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -212
app.py CHANGED
@@ -1,252 +1,252 @@
1
import mimetypes
import os
import tempfile
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr
import openai
import pandas as pd
import requests
import validators
from langchain import PromptTemplate, OpenAI, LLMChain
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent, create_pandas_dataframe_agent
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
from pydantic import NoneStr
18
-
19
class Chatbot:
    """Gradio chatbot that answers questions about uploaded files or URLs.

    Documents are loaded with ``UnstructuredFileLoader``, split into
    overlapping chunks, embedded with ``OpenAIEmbeddings`` and stored in a
    Chroma vector store. Questions are answered by a
    ``ConversationalRetrievalChain`` built on top of that store.
    """

    # Header sent with every download request (same value the original
    # inline dict used).
    _REQUEST_HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }
    # Seconds to wait for a remote server before giving up, so one dead
    # URL cannot hang the request handler forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Read the OpenAI API key from the ``OPENAI_API_KEY`` environment variable."""
        openai.api_key = os.getenv("OPENAI_API_KEY")

    def get_empty_state(self) -> Dict[str, Any]:
        """Return the initial session state: a dict with no knowledge base."""
        return {"knowledge_base": None}

    @staticmethod
    def _extension_for_content_type(content_type: Optional[str]) -> str:
        """Map an HTTP ``Content-Type`` header value to a file extension.

        Parameters such as ``; charset=utf-8`` are dropped before the
        lookup because ``mimetypes.guess_extension`` only knows bare MIME
        types. Returns ``""`` when the header is missing or unknown.
        """
        if not content_type:
            return ""
        mime_type = content_type.split(";", 1)[0].strip()
        return mimetypes.guess_extension(mime_type) or ""

    def _load_documents(self, file_paths: List[str]) -> List[Any]:
        """Load every path with ``UnstructuredFileLoader`` and return all documents."""
        docs = []
        for path in file_paths:
            docs.extend(UnstructuredFileLoader(path, strategy="fast").load())
        return docs

    def create_knowledge_base(self, docs):
        """Create a knowledge base from the given documents.

        Args:
            docs: Documents as returned by a loader's ``load()``.

        Returns:
            The Chroma vector store built from the document chunks.
        """
        # Chunks are at most 1000 characters long, split on newlines, and
        # consecutive chunks overlap by 200 characters.
        text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len
        )
        chunks = text_splitter.split_documents(docs)

        embeddings = OpenAIEmbeddings()
        return Chroma.from_documents(chunks, embeddings)

    def upload_file(self, file_paths):
        """Build a knowledge base from uploaded files.

        Args:
            file_paths: Uploaded file objects; each must expose ``.name``.

        Returns:
            tuple: The list of file paths and the new state dict.
        """
        file_paths = [i.name for i in file_paths]
        print(file_paths)

        docs = self._load_documents(file_paths)
        knowledge_base = self.create_knowledge_base(docs)
        return file_paths, {"knowledge_base": knowledge_base}

    def add_text(self, history, text):
        """Append the user's message to the chat and lock the textbox.

        Returns:
            tuple: The extended history and a Gradio update that clears
            and disables the input until the answer has been produced.
        """
        history = history + [(text, None)]
        return history, gr.update(value="", interactive=False)

    def upload_multiple_urls(self, urls):
        """Download comma-separated URLs and build a knowledge base from them.

        Args:
            urls (str): Comma-separated URLs. Entries that are not valid
                URLs are skipped.

        Returns:
            tuple: The list of downloaded temp-file paths and the new state.

        Raises:
            ValueError: If a URL does not answer with status 200, or if no
                valid URL was given at all.
        """
        file_paths = []
        for url in (part.strip() for part in urls.split(',')):
            if not validators.url(url):
                continue
            r = requests.get(url, headers=self._REQUEST_HEADERS, timeout=self._REQUEST_TIMEOUT)
            if r.status_code != 200:
                raise ValueError(
                    "Check the url of your file; returned status code %s" % r.status_code
                )
            file_extension = self._extension_for_content_type(r.headers.get("content-type"))
            # The with-block closes (and therefore flushes) the file before
            # the loader opens it by name; delete=False keeps it on disk.
            with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
                temp_file.write(r.content)
            file_paths.append(temp_file.name)

        if not file_paths:
            raise ValueError("No valid URL was provided")

        docs = self._load_documents(file_paths)
        knowledge_base = self.create_knowledge_base(docs)
        return file_paths, {"knowledge_base": knowledge_base}

    def answer_question(self, question, history, state):
        """Answer the most recent question in ``history``.

        Args:
            question: Ignored; the question is taken from the last history
                entry (the textbox has already been cleared by ``add_text``).
            history: Chat history as ``[user, bot]`` pairs; the last pair
                holds the pending question.
            state (dict): The current state containing the knowledge base.

        Returns:
            The history with the last pair completed by the answer.
        """
        if not history:
            return history

        question = history[-1][0]
        knowledge_base = state.get("knowledge_base") if isinstance(state, dict) else None
        if knowledge_base is None:
            # Nothing indexed yet (or the state was cleared): tell the user
            # instead of failing on a missing retriever.
            history[-1] = [question, "Please upload a document or URL first."]
            return history

        qa = ConversationalRetrievalChain.from_llm(
            llm=OpenAI(temperature=0.5),
            retriever=knowledge_base.as_retriever(),
            return_source_documents=False,
        )
        # The chain expects previous turns as (human, ai) tuples.
        chat_history = [(human, ai) for human, ai in history[:-1]]
        result = qa({"question": question, "chat_history": chat_history})

        # Replace the whole pair so this also works when the entry is a tuple.
        history[-1] = [question, result["answer"]]
        return history

    def extract_excel_data(self, file_path):
        """Read an Excel file and return all cell values as one flat list (row by row)."""
        df = pd.read_excel(file_path)
        return [value for _, row in df.iterrows() for value in row.tolist()]

    def comparing_chemicals(self, excel_file_path, chemicals):
        """Ask the OpenAI completion API for the chemicals shared by two sources.

        Args:
            excel_file_path: Uploaded Excel file object exposing ``.name``.
            chemicals: Text listing chemicals to compare against the sheet.

        Returns:
            str: The stripped completion text.
        """
        chemistry_capability = self.extract_excel_data(excel_file_path.name)
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=f"""Analyse the following text delimited by triple backticks to return the comman chemicals.
            text : ```{chemicals} {chemistry_capability}```.
            result should be in bullet points format.
            """,
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0
        )

        return response.choices[0].text.strip()

    def clear_function(self, state):
        """Reset the session state in place.

        The ``"knowledge_base"`` key is kept (set to ``None``) so later
        lookups of that key do not fail.
        """
        state.clear()
        state.update(self.get_empty_state())

    def gradio_interface(self):
        """Build and launch the Gradio interface."""
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            state = gr.State(self.get_empty_state())
            with gr.Column(elem_id="col-container"):
                gr.HTML(
                    """<hr style="border-top: 5px solid white;">"""
                )
                gr.HTML(
                    """<br>
                    <h1 style="text-align:center;font-size:50px;">
                    ADOPLE AI
                    </h1> """
                )
                gr.HTML(
                    """<br>
                    <h1 style="text-align:center;">
                    Multi URL and Doc Chatbot
                    </h1> """
                )
                gr.HTML(
                    """<hr style="border-top: 5px solid white;">"""
                )

                gr.Markdown("**Upload your URL,Documents**")
                with gr.Accordion("Upload Files", open=False):
                    with gr.Row(elem_id="row-flex"):
                        with gr.Row(elem_id="row-flex"):
                            with gr.Column(scale=1,):
                                file_url = gr.Textbox(label='file url :', show_label=True, placeholder="")
                        with gr.Row(elem_id="row-flex"):
                            with gr.Column(scale=1):
                                file_output = gr.File()
                            with gr.Column(scale=1):
                                upload_button = gr.UploadButton(
                                    "Browse File", file_types=[".txt", ".pdf", ".doc", ".docx"],
                                    file_count="multiple")
                with gr.Row():
                    chatbot = gr.Chatbot([], elem_id="chatbot")
                with gr.Row():
                    txt = gr.Textbox(
                        label="Question",
                        show_label=True,
                        placeholder="",
                    )
                with gr.Row():
                    clear_btn = gr.Button(value="Clear")

            # Submit: show the question immediately, then compute the answer,
            # then re-enable the textbox that add_text disabled.
            txt_msg = txt.submit(self.add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
                self.answer_question, [txt, chatbot, state], chatbot
            )
            txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)

            file_url.submit(self.upload_multiple_urls, file_url, [file_output, state])
            clear_btn.click(self.clear_function, [state], [])
            clear_btn.click(lambda: None, None, chatbot, queue=False)
            upload_button.upload(self.upload_file, upload_button, [file_output, state])
        demo.queue().launch(debug=True)
249
-
250
if __name__ == "__main__":
    # Script entry point: build the chatbot and start serving the Gradio UI.
    Chatbot().gradio_interface()
 
1
class ChatDocumentQA:
    """Gradio chatbot that answers questions about documents, CSV files or URLs.

    Documents and web pages are indexed in a FAISS vector store and queried
    through a ``ConversationalRetrievalChain``; CSV files are queried through
    a pandas dataframe agent wrapped in a conversational ``AgentExecutor``.
    """

    # Header sent with every download request (same value the original
    # inline dict used).
    _REQUEST_HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }
    # Seconds to wait for a remote server before giving up.
    _REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        """Nothing to initialise; all data lives in the per-session Gradio state."""
        pass

    def _get_empty_state(self) -> Dict[str, None]:
        """Create an empty knowledge base."""
        return {"knowledge_base": None}

    @staticmethod
    def _extension_for_content_type(content_type: Optional[str]) -> str:
        """Map an HTTP ``Content-Type`` header value to a file extension.

        Parameters such as ``; charset=utf-8`` are dropped before the
        lookup because ``mimetypes.guess_extension`` only knows bare MIME
        types. Returns ``""`` when the header is missing or unknown.
        """
        if not content_type:
            return ""
        mime_type = content_type.split(";", 1)[0].strip()
        return mimetypes.guess_extension(mime_type) or ""

    def _extract_text_from_pdfs(self, file_paths: List[str]) -> List[Any]:
        """Load documents from files.

        Despite the name, every path is handed to ``UnstructuredFileLoader``,
        so this is also used for downloaded web pages and non-PDF uploads.

        Args:
            file_paths (List[str]): List of file paths.

        Returns:
            List: Documents returned by the loaders, in input order.
        """
        docs = []
        for path in file_paths:
            docs.extend(UnstructuredFileLoader(path, strategy="fast").load())
        return docs

    def _get_content_from_url(self, urls: str) -> List[Any]:
        """Fetch content from given URLs.

        Args:
            urls (str): Comma-separated URLs. Surrounding whitespace is
                ignored; entries that are not valid URLs are skipped.

        Returns:
            List: Documents loaded from the downloaded content.

        Raises:
            ValueError: If a URL does not answer with status 200, or if no
                valid URL was given at all.
        """
        file_paths = []
        for url in (part.strip() for part in urls.split(',')):
            if not validators.url(url):
                continue
            r = requests.get(url, headers=self._REQUEST_HEADERS, timeout=self._REQUEST_TIMEOUT)
            if r.status_code != 200:
                raise ValueError("Check the url of your file; returned status code %s" % r.status_code)
            file_extension = self._extension_for_content_type(r.headers.get("content-type"))
            # The with-block closes (and therefore flushes) the file before
            # the loader opens it by name; delete=False keeps it on disk.
            with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
                temp_file.write(r.content)
            file_paths.append(temp_file.name)

        if not file_paths:
            raise ValueError("No valid URL was provided")

        return self._extract_text_from_pdfs(file_paths)

    def _split_text_into_chunks(self, text: List[Any]) -> List[Any]:
        """Split documents into smaller chunks.

        Args:
            text: Documents to split (passed to ``split_documents``).

        Returns:
            List: Chunks of at most 1000 characters with 200 characters of
            overlap, split on newlines.
        """
        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
        return text_splitter.split_documents(text)

    def _create_vector_store_from_text_chunks(self, text_chunks: List[Any]) -> "FAISS":
        """Create a vector store from document chunks.

        Args:
            text_chunks: Chunks produced by ``_split_text_into_chunks``.

        Returns:
            FAISS: Vector store created from the chunks.
        """
        embeddings = OpenAIEmbeddings()
        return FAISS.from_documents(documents=text_chunks, embedding=embeddings)

    def _create_conversation_chain(self, vectorstore):
        """Build a retrieval chain that condenses follow-ups into standalone questions.

        No memory object is attached: a chain is created per request, so an
        attached memory would always be empty and would override the
        ``chat_history`` the caller passes in.
        """
        _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History: {chat_history}
Follow Up Input: {question}
Standalone question:"""
        CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

        llm = ChatOpenAI(temperature=0)

        return ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            condense_question_prompt=CONDENSE_QUESTION_PROMPT,
        )

    def _get_documents_knowledge_base(self, file_paths: List[Any]) -> Tuple[str, Dict[str, Any]]:
        """Build knowledge base from uploaded files.

        The type of the first file decides the mode: a ``.csv`` upload
        yields a dataframe agent over that file, anything else is indexed
        (all files together) in a vector store.

        Args:
            file_paths: Uploaded file objects; each must expose ``.name``.

        Returns:
            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
        """
        if not file_paths:
            return "no file uploaded", self._get_empty_state()

        paths = [uploaded.name for uploaded in file_paths]
        file_extension = os.path.splitext(paths[0])[1].lower()

        if file_extension == '.csv':
            df = pd.read_csv(paths[0])
            pd_agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True)
            tools = self.get_agent_tools(pd_agent)
            memory, tools, prompt = self.create_memory_for_csv_qa(tools)
            agent_chain = self.create_agent_chain_for_csv_qa(memory, tools, prompt)
            return "file uploaded", {"knowledge_base": agent_chain}

        # .pdf, .txt, .doc, .docx: the loader handles all of them, so every
        # type the upload button accepts ends up with a usable index.
        documents = self._extract_text_from_pdfs(paths)
        text_chunks = self._split_text_into_chunks(documents)
        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    def _get_urls_knowledge_base(self, urls: str) -> Tuple[str, Dict[str, Any]]:
        """Build knowledge base from URLs.

        Args:
            urls (str): Comma-separated URLs.

        Returns:
            Tuple[str, Dict]: Tuple containing a status message and the knowledge base.
        """
        webpage_text = self._get_content_from_url(urls)
        text_chunks = self._split_text_into_chunks(webpage_text)
        vectorstore = self._create_vector_store_from_text_chunks(text_chunks)
        return "file uploaded", {"knowledge_base": vectorstore}

    # ************************
    # csv qa
    # ************************
    def get_agent_tools(self, agent):
        """Wrap the dataframe agent's ``run`` method as the single tool of the CSV agent."""
        tools = [
            Tool(
                name="dataframe qa",
                func=agent.run,
                description="useful for when you need to answer questions about table data and dataframe data",
            )
        ]
        return tools

    def create_memory_for_csv_qa(self, tools):
        """Create the prompt and conversation memory for the CSV agent.

        Returns:
            tuple: ``(memory, tools, prompt)``; ``tools`` is passed through unchanged.
        """
        prefix = """Have a conversation with a human, answering the following questions about table data and dataframe data as best you can. You have access to the following tools:"""
        suffix = """Begin!"

{chat_history}
Question: {input}
{agent_scratchpad}"""

        prompt = ZeroShotAgent.create_prompt(
            tools,
            prefix=prefix,
            suffix=suffix,
            input_variables=["input", "chat_history", "agent_scratchpad"],
        )
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

        return memory, tools, prompt

    def create_agent_chain_for_csv_qa(self, memory, tools, prompt):
        """Assemble the conversational ``AgentExecutor`` used for CSV questions."""
        llm_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
        agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
        agent_chain = AgentExecutor.from_agent_and_tools(
            agent=agent, tools=tools, verbose=True, memory=memory
        )

        return agent_chain

    def _get_response(self, message: str, chat_history: List[Tuple[str, str]], state: Dict[str, Any], file_paths) -> Tuple[str, List[Tuple[str, str]]]:
        """Get a response from the chatbot.

        The kind of knowledge base stored in ``state`` decides how the
        question is answered: objects offering ``as_retriever`` are queried
        through a retrieval chain, anything else is treated as an agent
        chain and called via ``run``.

        Args:
            message (str): User's message/question.
            chat_history (List[Tuple[str, str]]): List of chat history as tuples of (user_message, bot_response).
            state (dict): State containing the knowledge base.
            file_paths: Current value of the upload button. Kept for
                interface compatibility; not used, because it goes stale
                once a URL replaces an uploaded file as the knowledge base.

        Returns:
            Tuple[str, List[Tuple[str, str]]]: An empty string (clears the
            question box) and the updated chat history.
        """
        if chat_history is None:
            chat_history = []

        knowledge_base = state.get("knowledge_base") if isinstance(state, dict) else None
        if knowledge_base is None:
            chat_history.append((message, "Please Upload Document or URL"))
            return "", chat_history

        try:
            if hasattr(knowledge_base, "as_retriever"):
                chat = self._create_conversation_chain(knowledge_base)
                # The chain expects previous turns as (human, ai) tuples;
                # Gradio may hand them over as two-element lists.
                past_turns = [(human, ai) for human, ai in chat_history]
                response = chat({"question": message, "chat_history": past_turns})
                answer = response["answer"]
            else:
                answer = knowledge_base.run(input=message)
        except Exception as exc:
            # Keep the UI responsive, but report the real failure rather
            # than claiming that no document was uploaded.
            answer = f"Sorry, the question could not be answered: {exc}"

        chat_history.append((message, answer))
        return "", chat_history

    def gradio_interface(self) -> None:
        """Create a Gradio interface for the chatbot."""
        with gr.Blocks(theme='karthikeyan-adople/hudsonhayes-gray') as demo:
            state = gr.State(self._get_empty_state())
            chatbot = gr.Chatbot()
            with gr.Row():
                with gr.Column(scale=0.85):
                    msg = gr.Textbox(label="Question")
                with gr.Column(scale=0.15):
                    file_output = gr.Textbox(label="File Status")
            with gr.Row():
                with gr.Column(scale=0.85):
                    gr.ClearButton([msg, chatbot])
                with gr.Column(scale=0.15):
                    upload_button = gr.UploadButton(
                        "Browse File",
                        # .csv is listed so the CSV question-answering path
                        # is actually reachable from the UI.
                        file_types=[".txt", ".pdf", ".doc", ".docx", ".csv"],
                        file_count="multiple", variant="primary"
                    )
            with gr.Row():
                with gr.Column(scale=1):
                    input_url = gr.Textbox(label="urls")

            input_url.submit(self._get_urls_knowledge_base, input_url, [file_output, state])
            upload_button.upload(self._get_documents_knowledge_base, upload_button, [file_output, state])
            msg.submit(self._get_response, [msg, chatbot, state, upload_button], [msg, chatbot])

        demo.launch(debug=True)
249
+
250
if __name__ == "__main__":
    # Script entry point: build the app and start serving the Gradio UI.
    ChatDocumentQA().gradio_interface()