arslan-ahmed committed on
Commit
8e71274
·
1 Parent(s): 4da8f94

separate functions py file

Browse files
Files changed (3) hide show
  1. app.py +54 -299
  2. ttyd_consts.py +50 -0
  3. ttyd_functions.py +261 -0
app.py CHANGED
@@ -22,251 +22,22 @@ from urllib.parse import urlparse
22
  import mimetypes
23
  from pathlib import Path
24
  import tiktoken
 
 
25
 
26
-
27
- # Regex pattern to match a URL
28
- HTTP_URL_PATTERN = r'^http[s]*://.+'
29
-
30
- mimetypes.init()
31
- media_files = tuple([x for x in mimetypes.types_map if mimetypes.types_map[x].split('/')[0] in ['image', 'video', 'audio']])
32
- filter_strings = ['/email-protection#']
33
-
34
- def get_hyperlinks(url):
35
- try:
36
- reqs = requests.get(url)
37
- if not reqs.headers.get('Content-Type').startswith("text/html") or 400<=reqs.status_code<600:
38
- return []
39
- soup = BeautifulSoup(reqs.text, 'html.parser')
40
- except Exception as e:
41
- print(e)
42
- return []
43
-
44
- hyperlinks = []
45
- for link in soup.find_all('a', href=True):
46
- hyperlinks.append(link.get('href'))
47
-
48
- return hyperlinks
49
-
50
-
51
- # Function to get the hyperlinks from a URL that are within the same domain
52
- def get_domain_hyperlinks(local_domain, url):
53
- clean_links = []
54
- for link in set(get_hyperlinks(url)):
55
- clean_link = None
56
-
57
- # If the link is a URL, check if it is within the same domain
58
- if re.search(HTTP_URL_PATTERN, link):
59
- # Parse the URL and check if the domain is the same
60
- url_obj = urlparse(link)
61
- if url_obj.netloc == local_domain:
62
- clean_link = link
63
-
64
- # If the link is not a URL, check if it is a relative link
65
- else:
66
- if link.startswith("/"):
67
- link = link[1:]
68
- elif link.startswith(("#", '?', 'mailto:')):
69
- continue
70
-
71
- if 'wp-content/uploads' in url:
72
- clean_link = url+ "/" + link
73
- else:
74
- clean_link = "https://" + local_domain + "/" + link
75
-
76
- if clean_link is not None:
77
- clean_link = clean_link.strip().rstrip('/').replace('/../', '/')
78
-
79
- if not any(x in clean_link for x in filter_strings):
80
- clean_links.append(clean_link)
81
-
82
- # Return the list of hyperlinks that are within the same domain
83
- return list(set(clean_links))
84
-
85
- # this function will get you a list of all the URLs from the base URL
86
- def crawl(url, local_domain, prog=None):
87
- # Create a queue to store the URLs to crawl
88
- queue = deque([url])
89
-
90
- # Create a set to store the URLs that have already been seen (no duplicates)
91
- seen = set([url])
92
-
93
- # While the queue is not empty, continue crawling
94
- while queue:
95
- # Get the next URL from the queue
96
- url_pop = queue.pop()
97
- # Get the hyperlinks from the URL and add them to the queue
98
- for link in get_domain_hyperlinks(local_domain, url_pop):
99
- if link not in seen:
100
- queue.append(link)
101
- seen.add(link)
102
- if len(seen)>=100:
103
- return seen
104
- if prog is not None: prog(1, desc=f'Crawling: {url_pop}')
105
-
106
- return seen
107
-
108
-
109
- def ingestURL(documents, url, crawling=True, prog=None):
110
- url = url.rstrip('/')
111
- # Parse the URL and get the domain
112
- local_domain = urlparse(url).netloc
113
- if not (local_domain and url.startswith('http')):
114
- return documents
115
- print('Loading URL', url)
116
- if crawling:
117
- # crawl to get other webpages from this URL
118
- if prog is not None: prog(0, desc=f'Crawling: {url}')
119
- links = crawl(url, local_domain, prog)
120
- if prog is not None: prog(1, desc=f'Crawling: {url}')
121
- else:
122
- links = set([url])
123
- # separate pdf and other links
124
- c_links, pdf_links = [], []
125
- for x in links:
126
- if x.endswith('.pdf'):
127
- pdf_links.append(x)
128
- elif not x.endswith(media_files):
129
- c_links.append(x)
130
-
131
- # Clean links loader using WebBaseLoader
132
- if prog is not None: prog(0.5, desc=f'Ingesting: {url}')
133
- if c_links:
134
- loader = WebBaseLoader(list(c_links))
135
- documents.extend(loader.load())
136
-
137
- # remote PDFs loader
138
- for pdf_link in list(pdf_links):
139
- loader = PyMuPDFLoader(pdf_link)
140
- doc = loader.load()
141
- for x in doc:
142
- x.metadata['source'] = loader.source
143
- documents.extend(doc)
144
-
145
- return documents
146
-
147
- def ingestFiles(documents, files_list, prog=None):
148
- for fPath in files_list:
149
- doc = None
150
- if fPath.endswith('.pdf'):
151
- doc = PyMuPDFLoader(fPath).load()
152
- elif fPath.endswith('.txt'):
153
- doc = TextLoader(fPath).load()
154
- elif fPath.endswith(('.doc', 'docx')):
155
- doc = Docx2txtLoader(fPath).load()
156
- elif 'WhatsApp Chat with' in fPath and fPath.endswith('.csv'):
157
- doc = WhatsAppChatLoader(fPath).load()
158
- else:
159
- pass
160
-
161
- if doc is not None and doc[0].page_content:
162
- if prog is not None: prog(1, desc='Loaded file: '+fPath.rsplit('/')[0])
163
- print('Loaded file:', fPath)
164
- documents.extend(doc)
165
- return documents
166
-
167
-
168
- def data_ingestion(inputDir=None, file_list=[], waDir=None, url_list=[], prog=None):
169
- documents = []
170
- # Ingestion from Input Directory
171
- if inputDir is not None:
172
- files = [str(x) for x in Path(inputDir).glob('**/*')]
173
- documents = ingestFiles(documents, files)
174
- if file_list:
175
- documents = ingestFiles(documents, file_list, prog)
176
- # Ingestion of whatsapp chats - Convert Whatsapp TXT files to CSV using https://whatstk.streamlit.app/
177
- if waDir is not None:
178
- for fPath in [str(x) for x in Path(waDir).glob('**/*.csv')]:
179
- waDoc = WhatsAppChatLoader(fPath).load()
180
- if waDoc[0].page_content:
181
- print('Loaded whatsapp file:', fPath)
182
- documents.extend(waDoc)
183
- # Ingestion from URLs - also try https://python.langchain.com/docs/integrations/document_loaders/recursive_url_loader
184
- if url_list:
185
- for url in url_list:
186
- documents = ingestURL(documents, url, prog=prog)
187
-
188
-
189
- # Cleanup documents
190
- for x in documents:
191
- if 'WhatsApp Chat with ' not in x.metadata['source']:
192
- x.page_content = x.page_content.strip().replace('\n', ' ').replace('\\n', ' ').replace(' ', ' ')
193
-
194
- print(f"Total number of documents: {len(documents)}")
195
- return documents
196
-
197
-
198
- def split_docs(documents):
199
- # Splitting and Chunks
200
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=250) # default chunk size of 4000 makes around 1k tokens per doc. with k=4, this means 4k tokens input to LLM.
201
- docs = text_splitter.split_documents(documents)
202
- return docs
203
-
204
- # used for Hardcoded documents only - not uploaded by user
205
- def getVectorStore(openApiKey, documents, chromaClient=None):
206
- docs = split_docs(documents)
207
- # Embeddings
208
- embeddings = OpenAIEmbeddings(openai_api_key=openApiKey)
209
- # create chroma client if doesnt exist
210
- if chromaClient is None:
211
- chromaClient = Chroma(embedding_function=embeddings)
212
- # clear chroma client before adding new docs
213
- if chromaClient._collection.count()>0:
214
- chromaClient.delete(chromaClient.get()['ids'])
215
- # add new docs to chroma client
216
- chromaClient.add_documents(docs)
217
- print('vectorstore count:',chromaClient._collection.count(), 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
218
-
219
- return chromaClient
220
-
221
-
222
- def getSourcesFromMetadata(metadata, sourceOnly=True, sepFileUrl=True):
223
- # metadata: list of metadata dict from all documents
224
- setSrc = set()
225
- for x in metadata:
226
- metadataText = '' # we need to convert each metadata dict into a string format. This string will be added to a set
227
- if x is not None:
228
- # extract source first, and then extract all other items
229
- source = x['source']
230
- source = source.rsplit('/',1)[-1] if 'http' not in source else source
231
- notSource = []
232
- for k,v in x.items():
233
- if v is not None and k!='source' and k in ['page', 'title']:
234
- notSource.extend([f"{k}: {v}"])
235
- metadataText = ', '.join([f'source: {source}'] + notSource) if sourceOnly==False else source
236
- setSrc.add(metadataText)
237
-
238
- if sepFileUrl:
239
- src_files = '\n'.join(([f"{i+1}) {x}" for i,x in enumerate(sorted([x for x in setSrc if 'http' not in x], key=str.casefold))]))
240
- src_urls = '\n'.join(([f"{i+1}) {x}" for i,x in enumerate(sorted([x for x in setSrc if 'http' in x], key=str.casefold))]))
241
-
242
- src_files = 'Files:\n'+src_files if src_files else ''
243
- src_urls = 'URLs:\n'+src_urls if src_urls else ''
244
- newLineSep = '\n\n' if src_files and src_urls else ''
245
-
246
- return src_files + newLineSep + src_urls , len(setSrc)
247
- else:
248
- src_docs = '\n'.join(([f"{i+1}) {x}" for i,x in enumerate(sorted(list(setSrc), key=str.casefold))]))
249
- return src_docs, len(setSrc)
250
-
251
- def num_tokens_from_string(string, encoding_name = "cl100k_base"):
252
- """Returns the number of tokens in a text string."""
253
- encoding = tiktoken.get_encoding(encoding_name)
254
- num_tokens = len(encoding.encode(string))
255
- return num_tokens
256
  ###############################################################################################
257
 
258
- # Hardcoded Documents
 
259
 
260
- # documents = []
261
-
262
- # # Data Ingestion - take list of documents
263
- # documents = data_ingestion(inputDir= '../reports/',waDir = '../whatsapp-exports/')
264
- # full_text = ''.join([x.page_content for x in documents])
265
- # print('Full Text Len:', len(full_text), 'Num tokens:', num_tokens_from_string(full_text))
266
-
267
- # # Embeddings
268
- # vectorstore = getVectorStore(os.getenv("OPENAI_API_KEY"), documents)
269
 
 
 
 
 
 
270
 
271
 
272
  ###############################################################################################
@@ -279,7 +50,7 @@ def generateExamples(api_key_st, vsDict_st):
279
  qa_chain = RetrievalQA.from_llm(llm=ChatOpenAI(openai_api_key=api_key_st, temperature=0),
280
  retriever=vsDict_st['chromaClient'].as_retriever(search_type="similarity", search_kwargs={"k": 4}))
281
 
282
- result = qa_chain({'query': 'Generate top 5 questions that I can ask about this data. Questions should be very precise and short, ideally less than 10 words.'})
283
  answer = result['result'].strip('\n')
284
  grSamples = [[]]
285
  if answer.startswith('1. '):
@@ -291,24 +62,24 @@ def generateExamples(api_key_st, vsDict_st):
291
 
292
  # initialize chatbot function sets the QA Chain, and also sets/updates any other components to start chatting. updateQaChain function only updates QA chain and will be called whenever Adv Settings are updated.
293
  def initializeChatbot(temp, k, modelName, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
294
- progress(0.1, 'Analyzing your documents, please wait...')
295
  qa_chain_st = updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st)
296
- progress(0.5, 'Analyzing your documents, please wait...')
297
  #generate welcome message
298
- result = qa_chain_st({'question': 'Write a short welcome message to the user. Describe the document with a brief overview and short summary or any highlights. If this document is about a person, mention his name instead of using pronouns. After this, you should include top 3 example questions that user can ask about this data. Make sure you have got answers to those questions within the data. Your response should be short and precise. Format of your response should be Summary: {summary} \n\n\n Example Questions: {examples}', 'chat_history':[]})
 
299
  # exSamples = generateExamples(api_key_st, vsDict_st)
300
  # exSamples_vis = True if exSamples[0] else False
301
-
302
- return qa_chain_st, btn.update(interactive=True), initChatbot_btn.update('Chatbot ready. Now visit the chatbot Tab.', interactive=False)\
303
- , status_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('', result['answer'])])
304
 
 
 
305
 
306
 
307
  def setApiKey(api_key):
308
  if api_key==os.getenv("TEMP_PWD") and os.getenv("OPENAI_API_KEY") is not None:
309
  api_key=os.getenv("OPENAI_API_KEY")
310
  try:
311
- api_key='Null' if api_key is None or api_key=='' else api_key
312
  openai.Model.list(api_key=api_key) # test the API key
313
  api_key_st = api_key
314
 
@@ -317,50 +88,45 @@ def setApiKey(api_key):
317
  return aKey_tb.update(str(e), type='text'), *[x.update() for x in [aKey_btn, api_key_state]]
318
 
319
  # convert user uploaded data to vectorstore
320
- def userData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.Progress()):
321
  opComponents = [data_ingest_btn, upload_fb, urls_tb]
 
322
  file_paths = []
323
  documents = []
324
  if userFiles is not None:
325
  if not isinstance(userFiles, list): userFiles = [userFiles]
326
  file_paths = [file.name for file in userFiles]
327
  userUrls = [x.strip() for x in userUrls.split(",")] if userUrls else []
 
328
  documents = data_ingestion(file_list=file_paths, url_list=userUrls, prog=progress)
329
  if documents:
330
  for file in file_paths:
331
  os.remove(file)
332
  else:
333
  return {}, '', *[x.update() for x in opComponents]
334
-
335
  # Splitting and Chunks
336
  docs = split_docs(documents)
337
  # Embeddings
338
  try:
339
- api_key_st='Null' if api_key_st is None or api_key_st=='' else api_key_st
340
  openai.Model.list(api_key=api_key_st) # test the API key
341
  embeddings = OpenAIEmbeddings(openai_api_key=api_key_st)
342
  except Exception as e:
343
  return {}, str(e), *[x.update() for x in opComponents]
344
 
345
  progress(0.5, 'Creating Vector Database')
346
- # create chroma client if doesnt exist
347
- if vsDict_st.get('chromaDir') is None:
348
- vsDict_st['chromaDir'] = str(uuid.uuid1())
349
- vsDict_st['chromaClient'] = Chroma(embedding_function=embeddings, persist_directory=vsDict_st['chromaDir'])
350
- # clear chroma client before adding new docs
351
- if vsDict_st['chromaClient']._collection.count()>0:
352
- vsDict_st['chromaClient'].delete(vsDict_st['chromaClient'].get()['ids'])
353
- # add new docs to chroma client
354
- vsDict_st['chromaClient'].add_documents(docs)
355
- print('vectorstore count:',vsDict_st['chromaClient']._collection.count(), 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
356
-
357
- op_docs_str = getSourcesFromMetadata(vsDict_st['chromaClient'].get()['metadatas'])
358
- op_docs_str = str(op_docs_str[1]) + ' document(s) successfully loaded in vector store.'+'\n\n' + op_docs_str[0]
359
  progress(1, 'Data loaded')
360
- return vsDict_st, op_docs_str, *[x.update(interactive=False) for x in [data_ingest_btn, upload_fb]], urls_tb.update(interactive=False, placeholder='')
361
 
362
  # just update the QA Chain, no updates to any UI
363
  def updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st):
 
 
364
  modelName = modelName.split('(')[0].strip() # so we can provide any info in brackets
365
  # check if the input model is chat model or legacy model
366
  try:
@@ -417,16 +183,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
417
 
418
 
419
  # Setup the Gradio Layout
420
- gr.Markdown(
421
- """
422
- ## Chat with your documents and websites<br>
423
- Step 1) Enter your OpenAI API Key, and click Submit.<br>
424
- Step 2) Upload your documents and/or enter URLs, then click Load Data.<br>
425
- Step 3) Once data is loaded, click Initialize Chatbot (at the bottom of the page) to start talking to your data.<br>
426
-
427
- Your documents should be semantically similar (covering related topics or having the similar meaning) in order to get the best results.
428
- You may also play around with Advanced Settings, like changing the model name and parameters.
429
- """)
430
  with gr.Tabs() as tabs:
431
  with gr.Tab('Initialization', id='init'):
432
  with gr.Row():
@@ -435,14 +192,14 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
435
  , info='You can find OpenAI API key at https://platform.openai.com/account/api-keys'\
436
  , placeholder='Enter your API key here and hit enter to begin chatting')
437
  aKey_btn = gr.Button("Submit API Key")
438
- with gr.Row():
439
  upload_fb = gr.Files(scale=5, label="Upload (multiple) Files - pdf/txt/docx supported", file_types=['.doc', '.docx', 'text', '.pdf', '.csv'])
440
  urls_tb = gr.Textbox(scale=5, label="Enter URLs starting with https (comma separated)"\
441
- , info='Upto 100 domain webpages will be crawled for each URL. You can also enter online PDF files.'\
442
- , placeholder='https://example.com, https://another.com, https://anyremotedocument.pdf')
443
  data_ingest_btn = gr.Button("Load Data")
444
- status_tb = gr.TextArea(label='Status bar', show_label=False)
445
- initChatbot_btn = gr.Button("Initialize Chatbot")
446
 
447
  with gr.Tab('Chatbot', id='cb'):
448
  with gr.Row():
@@ -450,24 +207,22 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
450
  srcDocs = gr.TextArea(label="References")
451
  msg = gr.Textbox(label="User Input",placeholder="Type your questions here")
452
  with gr.Row():
453
- btn = gr.Button("Send Message", interactive=False)
454
  clear = gr.ClearButton(components=[msg, chatbot, srcDocs], value="Clear chat history")
455
- with gr.Row():
456
- # exp_comp = gr.Dataset(scale=0.7, samples=[['123'],['456'], ['123'],['456'],['456']], components=[msg], label='Examples (auto generated by LLM)', visible=False)
457
- # gr.Examples(examples=exps, inputs=msg)
458
- with gr.Accordion("Advance Settings - click to expand", open=False):
459
- with gr.Row():
460
  temp_sld = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature", info='Sampling temperature to use when calling LLM. Defaults to 0.7')
461
  k_sld = gr.Slider(minimum=1, maximum=10, step=1, value=4, label="K", info='Number of relavant documents to return from Vector Store. Defaults to 4')
462
  model_dd = gr.Dropdown(label='Model Name'\
463
- , choices=['gpt-3.5-turbo', 'gpt-3.5-turbo-16k', 'gpt-4', 'text-davinci-003 (Legacy)', 'text-curie-001 (Legacy)', 'babbage-002']\
464
- , value='gpt-3.5-turbo', allow_custom_value=True\
465
- , info='You can also input any OpenAI model name, compatible with /v1/completions or /v1/chat/completions endpoint. Details: https://platform.openai.com/docs/models/')
466
- stdlQs_rb = gr.Radio(label='Standalone Question', info='Standalone question is a new rephrased question generated based on your original question and chat history'\
467
- , type='index', value='Retrieve relavant docs using standalone question, send original question to LLM'\
468
- , choices=['Retrieve relavant docs using original question, send original question to LLM (Chat history not considered)'\
469
- , 'Retrieve relavant docs using standalone question, send original question to LLM'\
470
- , 'Retrieve relavant docs using standalone question, send standalone question to LLM'])
471
 
472
  ### Setup the Gradio Event Listeners
473
 
@@ -477,17 +232,17 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue='orange', secondary_hue='gray
477
  aKey_tb.submit(**aKey_btn_args)
478
 
479
  # Data Ingest Button
480
- data_ingest_btn.click(userData_vecStore, [upload_fb, urls_tb, api_key_state, chromaVS_state], [chromaVS_state, status_tb, data_ingest_btn, upload_fb, urls_tb])
481
 
482
  # Adv Settings
483
  advSet_args = {'fn':updateQaChain, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state]}
484
- temp_sld.change(**advSet_args)
485
- k_sld.change(**advSet_args)
486
  model_dd.change(**advSet_args)
487
  stdlQs_rb.change(**advSet_args)
488
-
489
  # Initialize button
490
- initChatbot_btn.click(initializeChatbot, [temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], [qa_state, btn, initChatbot_btn, status_tb, tabs, chatbot])
491
 
492
  # Chatbot submit button
493
  chat_btn_args = {'fn':respond, 'inputs':[msg, chatbot, qa_state], 'outputs':[msg, chatbot, srcDocs, btn]}
 
22
  import mimetypes
23
  from pathlib import Path
24
  import tiktoken
25
+ from ttyd_functions import *
26
+ from ttyd_consts import *
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  ###############################################################################################
29
 
30
+ # You want to hardcode Documents or take it from UI?
31
+ UiAddData = False
32
 
33
+ if UiAddData: # take input data from UI
34
+ md_title = md_title_general
 
 
 
 
 
 
 
35
 
36
+ else: # provide paths to the data
37
+ url_list = ['https://www.nustianusa.org', 'https://www.nustian.ca']
38
+ # local vector store as opposed to gradio state vector store
39
+ vsDict_hard = localData_vecStore(os.getenv("OPENAI_API_KEY"), url_list=url_list)
40
+ md_title = md_title_nustian
41
 
42
 
43
  ###############################################################################################
 
50
  qa_chain = RetrievalQA.from_llm(llm=ChatOpenAI(openai_api_key=api_key_st, temperature=0),
51
  retriever=vsDict_st['chromaClient'].as_retriever(search_type="similarity", search_kwargs={"k": 4}))
52
 
53
+ result = qa_chain({'query': exp_query})
54
  answer = result['result'].strip('\n')
55
  grSamples = [[]]
56
  if answer.startswith('1. '):
 
62
 
63
  # initialize chatbot function sets the QA Chain, and also sets/updates any other components to start chatting. updateQaChain function only updates QA chain and will be called whenever Adv Settings are updated.
64
  def initializeChatbot(temp, k, modelName, stdlQs, api_key_st, vsDict_st, progress=gr.Progress()):
65
+ progress(0.1, waitText_initialize)
66
  qa_chain_st = updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st)
67
+ progress(0.5, waitText_initialize)
68
  #generate welcome message
69
+ result = qa_chain_st({'question': initialize_prompt, 'chat_history':[]})
70
+
71
  # exSamples = generateExamples(api_key_st, vsDict_st)
72
  # exSamples_vis = True if exSamples[0] else False
 
 
 
73
 
74
+ return qa_chain_st, btn.update(interactive=True), initChatbot_btn.update('Chatbot ready. Now visit the chatbot Tab.', interactive=False)\
75
+ , aKey_tb.update(), gr.Tabs.update(selected='cb'), chatbot.update(value=[('', result['answer'])])
76
 
77
 
78
  def setApiKey(api_key):
79
  if api_key==os.getenv("TEMP_PWD") and os.getenv("OPENAI_API_KEY") is not None:
80
  api_key=os.getenv("OPENAI_API_KEY")
81
  try:
82
+ # api_key='Null' if api_key is None or api_key=='' else api_key
83
  openai.Model.list(api_key=api_key) # test the API key
84
  api_key_st = api_key
85
 
 
88
  return aKey_tb.update(str(e), type='text'), *[x.update() for x in [aKey_btn, api_key_state]]
89
 
90
  # convert user uploaded data to vectorstore
91
+ def uiData_vecStore(userFiles, userUrls, api_key_st, vsDict_st={}, progress=gr.Progress()):
92
  opComponents = [data_ingest_btn, upload_fb, urls_tb]
93
+ # parse user data
94
  file_paths = []
95
  documents = []
96
  if userFiles is not None:
97
  if not isinstance(userFiles, list): userFiles = [userFiles]
98
  file_paths = [file.name for file in userFiles]
99
  userUrls = [x.strip() for x in userUrls.split(",")] if userUrls else []
100
+ #create documents
101
  documents = data_ingestion(file_list=file_paths, url_list=userUrls, prog=progress)
102
  if documents:
103
  for file in file_paths:
104
  os.remove(file)
105
  else:
106
  return {}, '', *[x.update() for x in opComponents]
 
107
  # Splitting and Chunks
108
  docs = split_docs(documents)
109
  # Embeddings
110
  try:
111
+ # api_key_st='Null' if api_key_st is None or api_key_st=='' else api_key_st
112
  openai.Model.list(api_key=api_key_st) # test the API key
113
  embeddings = OpenAIEmbeddings(openai_api_key=api_key_st)
114
  except Exception as e:
115
  return {}, str(e), *[x.update() for x in opComponents]
116
 
117
  progress(0.5, 'Creating Vector Database')
118
+ vsDict_st = getVsDict(embeddings, docs, vsDict_st)
119
+ # get sources from metadata
120
+ src_str = getSourcesFromMetadata(vsDict_st['chromaClient'].get()['metadatas'])
121
+ src_str = str(src_str[1]) + ' source document(s) successfully loaded in vector store.'+'\n\n' + src_str[0]
122
+
 
 
 
 
 
 
 
 
123
  progress(1, 'Data loaded')
124
+ return vsDict_st, src_str, *[x.update(interactive=False) for x in [data_ingest_btn, upload_fb]], urls_tb.update(interactive=False, placeholder='')
125
 
126
  # just update the QA Chain, no updates to any UI
127
  def updateQaChain(temp, k, modelName, stdlQs, api_key_st, vsDict_st):
128
+ # if we are not adding data from ui, then use vsDict_hard as vectorstore
129
+ if vsDict_st=={} and not UiAddData: vsDict_st=vsDict_hard
130
  modelName = modelName.split('(')[0].strip() # so we can provide any info in brackets
131
  # check if the input model is chat model or legacy model
132
  try:
 
183
 
184
 
185
  # Setup the Gradio Layout
186
+ gr.Markdown(md_title)
 
 
 
 
 
 
 
 
 
187
  with gr.Tabs() as tabs:
188
  with gr.Tab('Initialization', id='init'):
189
  with gr.Row():
 
192
  , info='You can find OpenAI API key at https://platform.openai.com/account/api-keys'\
193
  , placeholder='Enter your API key here and hit enter to begin chatting')
194
  aKey_btn = gr.Button("Submit API Key")
195
+ with gr.Row(visible=UiAddData):
196
  upload_fb = gr.Files(scale=5, label="Upload (multiple) Files - pdf/txt/docx supported", file_types=['.doc', '.docx', 'text', '.pdf', '.csv'])
197
  urls_tb = gr.Textbox(scale=5, label="Enter URLs starting with https (comma separated)"\
198
+ , info=url_tb_info\
199
+ , placeholder=url_tb_ph)
200
  data_ingest_btn = gr.Button("Load Data")
201
+ status_tb = gr.TextArea(label='Status bar', show_label=False, visible=UiAddData)
202
+ initChatbot_btn = gr.Button("Initialize Chatbot", variant="primary")
203
 
204
  with gr.Tab('Chatbot', id='cb'):
205
  with gr.Row():
 
207
  srcDocs = gr.TextArea(label="References")
208
  msg = gr.Textbox(label="User Input",placeholder="Type your questions here")
209
  with gr.Row():
210
+ btn = gr.Button("Send Message", interactive=False, variant="primary")
211
  clear = gr.ClearButton(components=[msg, chatbot, srcDocs], value="Clear chat history")
212
+ # exp_comp = gr.Dataset(scale=0.7, samples=[['123'],['456'], ['123'],['456'],['456']], components=[msg], label='Examples (auto generated by LLM)', visible=False)
213
+ # gr.Examples(examples=exps, inputs=msg)
214
+ with gr.Accordion("Advance Settings - click to expand", open=False):
215
+ with gr.Row():
216
+ with gr.Column():
217
  temp_sld = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature", info='Sampling temperature to use when calling LLM. Defaults to 0.7')
218
  k_sld = gr.Slider(minimum=1, maximum=10, step=1, value=4, label="K", info='Number of relavant documents to return from Vector Store. Defaults to 4')
219
  model_dd = gr.Dropdown(label='Model Name'\
220
+ , choices=model_dd_choices\
221
+ , value=model_dd_choices[0], allow_custom_value=True\
222
+ , info=model_dd_info)
223
+ stdlQs_rb = gr.Radio(label='Standalone Question', info=stdlQs_rb_info\
224
+ , type='index', value=stdlQs_rb_choices[1]\
225
+ , choices=stdlQs_rb_choices)
 
 
226
 
227
  ### Setup the Gradio Event Listeners
228
 
 
232
  aKey_tb.submit(**aKey_btn_args)
233
 
234
  # Data Ingest Button
235
+ data_ingest_btn.click(uiData_vecStore, [upload_fb, urls_tb, api_key_state, chromaVS_state], [chromaVS_state, status_tb, data_ingest_btn, upload_fb, urls_tb])
236
 
237
  # Adv Settings
238
  advSet_args = {'fn':updateQaChain, 'inputs':[temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], 'outputs':[qa_state]}
239
+ temp_sld.release(**advSet_args)
240
+ k_sld.release(**advSet_args)
241
  model_dd.change(**advSet_args)
242
  stdlQs_rb.change(**advSet_args)
243
+
244
  # Initialize button
245
+ initChatbot_btn.click(initializeChatbot, [temp_sld, k_sld, model_dd, stdlQs_rb, api_key_state, chromaVS_state], [qa_state, btn, initChatbot_btn, aKey_tb, tabs, chatbot])
246
 
247
  # Chatbot submit button
248
  chat_btn_args = {'fn':respond, 'inputs':[msg, chatbot, qa_state], 'outputs':[msg, chatbot, srcDocs, btn]}
ttyd_consts.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exp_query = 'Generate top 5 questions that I can ask about this data. Questions should be very precise and short, ideally less than 10 words.'
2
+
3
+ waitText_initialize = 'Preparing the documents, please wait...'
4
+
5
+ initialize_prompt = 'Write a short welcome message to the user. Describe the documents with a brief overview including short summary or any highlights.\
6
+ If these documents are about a person, mention his name instead of using pronouns. After describing the overview, you should mention top 3 example questions that the user can ask about this data.\
7
+ Your response should be short and precise. Format of your response should be Description:\n{description} \n\n Example Questions:\n{examples}'
8
+
9
+ nustian_exps = ['Tell me about NUSTIAN',
10
+ 'Who is the NUSTIAN regional lead for Silicon Valley?',
11
+ 'Tell me details about NUSTIAN coaching program.',
12
+ 'How can we donate to NUSTIAN fundraiser?',
13
+ 'Who is the president of NUSTIAN?',
14
+ "What are top five missions of NUSTIAN?",
15
+ ]
16
+
17
+ stdlQs_rb_info = 'Standalone question is a new rephrased question generated based on your original question and chat history'
18
+
19
+ stdlQs_rb_choices = ['Retrieve relavant docs using original question, send original question to LLM (Chat history not considered)'\
20
+ , 'Retrieve relavant docs using standalone question, send original question to LLM'\
21
+ , 'Retrieve relavant docs using standalone question, send standalone question to LLM']
22
+
23
+
24
+
25
+ model_dd_info = 'You can also input any OpenAI model name, compatible with /v1/completions or /v1/chat/completions endpoint. Details: https://platform.openai.com/docs/models/'
26
+
27
+ model_dd_choices = ['gpt-3.5-turbo', 'gpt-3.5-turbo-16k', 'gpt-4', 'text-davinci-003 (Legacy)', 'text-curie-001 (Legacy)', 'babbage-002']
28
+
29
+ url_tb_info = 'Upto 100 domain webpages will be crawled for each URL. You can also enter online PDF files.'
30
+
31
+ url_tb_ph = 'https://example.com, https://another.com, https://anyremotedocument.pdf'
32
+
33
+
34
+ md_title_general = """
35
+ ## Chat with your documents and websites<br>
36
+ Step 1) Enter your OpenAI API Key, and click Submit.<br>
37
+ Step 2) Upload your documents and/or enter URLs, then click Load Data.<br>
38
+ Step 3) Once data is loaded, click Initialize Chatbot (at the bottom of the page) to start talking to your data.<br>
39
+
40
+ Your documents should be semantically similar (covering related topics or having the similar meaning) in order to get the best results.
41
+ You may also play around with Advanced Settings, like changing the model name and parameters.
42
+ """
43
+
44
+ md_title_nustian = """
45
+ ## Chat with NUSTIAN website<br>
46
+ Step 1) Submit your OpenAI API Key.<br>
47
+ Step 2) Click Initialize Chatbot to start sending messages.<br>
48
+
49
+ You may also play around with Advanced Settings, like changing the model name and parameters.
50
+ """
ttyd_functions.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import datetime
3
+ import openai
4
+ import uuid
5
+ import gradio as gr
6
+ from langchain.embeddings import OpenAIEmbeddings
7
+ from langchain.vectorstores import Chroma
8
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from langchain.chains import RetrievalQA
11
+
12
+ import os
13
+ from langchain.chat_models import ChatOpenAI
14
+ from langchain import OpenAI
15
+ from langchain.document_loaders import WebBaseLoader, TextLoader, Docx2txtLoader, PyMuPDFLoader
16
+ from whatsapp_chat_custom import WhatsAppChatLoader # use this instead of from langchain.document_loaders import WhatsAppChatLoader
17
+
18
+ from collections import deque
19
+ import re
20
+ from bs4 import BeautifulSoup
21
+ import requests
22
+ from urllib.parse import urlparse
23
+ import mimetypes
24
+ from pathlib import Path
25
+ import tiktoken
26
+
27
# Regex pattern to match an absolute http:// or https:// URL.
# `https?` allows exactly one optional "s"; a character class with `*`
# would also accept malformed schemes such as "httpsss://".
HTTP_URL_PATTERN = r'^https?://.+'

mimetypes.init()
# File extensions (".png", ".mp4", ...) whose registered MIME type is image,
# video or audio. Kept as a tuple so it can be passed straight to str.endswith().
media_files = tuple(
    ext
    for ext, mime in mimetypes.types_map.items()
    if mime.split('/')[0] in ('image', 'video', 'audio')
)
# Links containing any of these substrings are dropped while crawling.
filter_strings = ['/email-protection#']
33
+
34
def get_hyperlinks(url, timeout=10):
    """Return the raw ``href`` values of every ``<a>`` tag on the page at *url*.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive host cannot stall the whole crawl.

    Returns:
        A list of href strings exactly as they appear in the HTML (absolute or
        relative). An empty list is returned when the request fails, the
        response status is 4xx/5xx, or the response is not ``text/html``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # The header may be absent; treat that the same as "not HTML".
        content_type = response.headers.get('Content-Type') or ''
        if not content_type.startswith("text/html") or 400 <= response.status_code < 600:
            return []
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception as e:
        # Best-effort crawling: a single bad page is reported and skipped
        # rather than aborting the crawl.
        print(e)
        return []

    return [anchor.get('href') for anchor in soup.find_all('a', href=True)]
49
+
50
+
51
+ # Function to get the hyperlinks from a URL that are within the same domain
52
def get_domain_hyperlinks(local_domain, url):
    """Collect the de-duplicated links on *url* that belong to *local_domain*.

    Absolute links are kept only when their network location equals
    *local_domain*. Relative links are turned into absolute ones; anchors,
    query-only links and ``mailto:`` links are ignored. Any link containing
    one of ``filter_strings`` is discarded.
    """
    collected = []
    for href in set(get_hyperlinks(url)):
        if re.search(HTTP_URL_PATTERN, href):
            # Absolute URL: keep it only if it stays on the same domain.
            if urlparse(href).netloc != local_domain:
                continue
            candidate = href
        else:
            # Relative link: normalise the leading slash, skip non-page links.
            if href.startswith("/"):
                href = href[1:]
            elif href.startswith(("#", '?', 'mailto:')):
                continue

            # Links found on an uploads page are resolved against that page,
            # everything else against the domain root.
            if 'wp-content/uploads' in url:
                base = url
            else:
                base = "https://" + local_domain
            candidate = base + "/" + href

        candidate = candidate.strip().rstrip('/').replace('/../', '/')
        if any(marker in candidate for marker in filter_strings):
            continue
        collected.append(candidate)

    # Different hrefs can normalise to the same URL, so de-duplicate again.
    return list(set(collected))
84
+
85
+ # this function will get you a list of all the URLs from the base URL
86
def crawl(url, local_domain, prog=None, max_pages=100):
    """Discover same-domain URLs reachable from *url*.

    Args:
        url: Starting page.
        local_domain: Network location that discovered links must belong to.
        prog: Optional progress callback, called as ``prog(1, desc=...)``
            after each page has been scanned.
        max_pages: Stop as soon as this many distinct URLs have been seen.

    Returns:
        The set of URLs seen so far, including *url* itself.
    """
    # URLs still to be scanned. Items are appended and popped at the same
    # end, so the most recently discovered page is scanned next.
    pending = deque([url])
    # Every URL already queued or scanned (prevents duplicates and loops).
    seen = {url}

    if len(seen) >= max_pages:
        return seen

    while pending:
        url_pop = pending.pop()
        for link in get_domain_hyperlinks(local_domain, url_pop):
            if link in seen:
                continue
            pending.append(link)
            seen.add(link)
            if len(seen) >= max_pages:
                return seen
        if prog is not None:
            prog(1, desc=f'Crawling: {url_pop}')

    return seen
107
+
108
+
109
def ingestURL(documents, url, crawling=True, prog=None):
    """Load the content behind *url* and append it to *documents*.

    When *crawling* is true the site is crawled first and every discovered
    page is loaded; otherwise only *url* itself is loaded. Links ending in
    ``.pdf`` go through the PDF loader, links ending in a media extension are
    skipped, and the rest go through the web-page loader. *documents* is
    extended in place and returned. Inputs that do not look like an http(s)
    URL are ignored.
    """
    url = url.rstrip('/')
    local_domain = urlparse(url).netloc
    if not local_domain or not url.startswith('http'):
        return documents

    print('Loading URL', url)
    if not crawling:
        links = set([url])
    else:
        # crawl to get other webpages from this URL
        if prog is not None:
            prog(0, desc=f'Crawling: {url}')
        links = crawl(url, local_domain, prog)
        if prog is not None:
            prog(1, desc=f'Crawling: {url}')

    # Separate remote PDFs from ordinary pages; media files are dropped.
    pdf_links = [link for link in links if link.endswith('.pdf')]
    c_links = [
        link for link in links
        if not link.endswith('.pdf') and not link.endswith(media_files)
    ]

    if prog is not None:
        prog(0.5, desc=f'Ingesting: {url}')
    if c_links:
        documents.extend(WebBaseLoader(c_links).load())

    for pdf_link in pdf_links:
        pdf_loader = PyMuPDFLoader(pdf_link)
        pages = pdf_loader.load()
        # Record the loader's own source value on every page.
        for page in pages:
            page.metadata['source'] = pdf_loader.source
        documents.extend(pages)

    return documents
146
+
147
def ingestFiles(documents, files_list, prog=None):
    """Load every supported file in *files_list* and append it to *documents*.

    Supported inputs: ``.pdf``; ``.txt`` (unless it is a WhatsApp export);
    ``.doc``/``docx``; and WhatsApp chats converted to ``.csv``. Any other
    path is skipped silently.

    Args:
        documents: List that loaded documents are appended to (mutated).
        files_list: Iterable of file path strings.
        prog: Optional progress callback, called as ``prog(1, desc=...)``
            for each file that produced content.

    Returns:
        The same *documents* list.
    """
    for fPath in files_list:
        is_whatsapp = 'WhatsApp Chat with' in fPath
        doc = None
        if fPath.endswith('.pdf'):
            doc = PyMuPDFLoader(fPath).load()
        elif fPath.endswith('.txt') and not is_whatsapp:
            doc = TextLoader(fPath).load()
        elif fPath.endswith(('.doc', 'docx')):
            doc = Docx2txtLoader(fPath).load()
        elif is_whatsapp and fPath.endswith('.csv'):  # Convert Whatsapp TXT files to CSV using https://whatstk.streamlit.app/
            doc = WhatsAppChatLoader(fPath).load()

        # `doc` is None for unsupported files and may be an empty list when a
        # loader finds nothing; both are skipped instead of indexing doc[0].
        if doc and doc[0].page_content:
            if prog is not None:
                # Show only the file name (last path component) to the user.
                prog(1, desc='Loaded file: ' + fPath.rsplit('/', 1)[-1])
            print('Loaded file:', fPath)
            documents.extend(doc)
    return documents
166
+
167
+
168
def data_ingestion(inputDir=None, file_list=[], url_list=[], prog=None):
    """Load documents from a directory, explicit files and/or URLs.

    Args:
        inputDir: Optional directory; every path below it is offered to
            ``ingestFiles``.
        file_list: Optional file paths to load (read only, never mutated).
        url_list: Optional URLs to load (read only, never mutated).
        prog: Optional progress callback forwarded to the file and URL loaders.

    Returns:
        A list of loaded documents. Except for WhatsApp chats, each
        document's text is stripped, newlines are replaced by spaces and runs
        of spaces are collapsed to a single space.
    """
    documents = []
    # Ingestion from Input Directory
    if inputDir is not None:
        files = [str(x) for x in Path(inputDir).glob('**/*')]
        documents = ingestFiles(documents, files)
    if file_list:
        documents = ingestFiles(documents, file_list, prog)
    # Ingestion from URLs - also try https://python.langchain.com/docs/integrations/document_loaders/recursive_url_loader
    if url_list:
        for url in url_list:
            documents = ingestURL(documents, url, prog=prog)

    # Cleanup documents
    for x in documents:
        # A document without a 'source' entry is cleaned like any other.
        if 'WhatsApp Chat with' not in (x.metadata.get('source') or ''):
            text = x.page_content.strip().replace('\n', ' ').replace('\\n', ' ')
            # Collapse every run of spaces in one pass; a single str.replace
            # would leave longer runs only partially reduced.
            x.page_content = re.sub(r' {2,}', ' ', text)

    return documents
188
+
189
+
190
def split_docs(documents, chunk_size=2500, chunk_overlap=250):
    """Split *documents* into overlapping chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length passed to the splitter. Larger
            chunks mean more text is sent to the LLM per retrieved document.
        chunk_overlap: Length shared between consecutive chunks.

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)
195
+
196
+
197
def getSourcesFromMetadata(metadata, sourceOnly=True, sepFileUrl=True):
    """Summarise the distinct sources found in a list of metadata dicts.

    Args:
        metadata: List of metadata dicts from all documents. ``None`` entries
            and entries without a ``source`` value are ignored.
        sourceOnly: When true each entry is just the source (file name for
            local paths, full address for anything containing ``http``). When
            false the entry is ``"source: <src>"`` followed by the ``page``
            and ``title`` values that are present and not ``None``.
        sepFileUrl: When true the result lists files and URLs under separate
            ``Files:`` / ``URLs:`` headings; otherwise a single numbered list.

    Returns:
        A ``(text, count)`` tuple: the numbered, case-insensitively sorted
        listing and the number of distinct entries.
    """
    def _numbered(entries):
        # "1) first\n2) second", sorted case-insensitively.
        ordered = sorted(entries, key=str.casefold)
        return '\n'.join(f"{i}) {entry}" for i, entry in enumerate(ordered, start=1))

    setSrc = set()
    for x in metadata:
        if not x:
            continue
        source = x.get('source')
        if not source:
            # Nothing meaningful to show; do not add an empty entry.
            continue
        # Local paths are shortened to the file name, URLs are kept whole.
        if 'http' not in source:
            source = source.rsplit('/', 1)[-1]
        if sourceOnly:
            setSrc.add(source)
            continue
        notSource = [f"{k}: {v}" for k, v in x.items() if k in ('page', 'title') and v is not None]
        setSrc.add(', '.join([f'source: {source}'] + notSource))

    if not sepFileUrl:
        return _numbered(setSrc), len(setSrc)

    src_files = _numbered(s for s in setSrc if 'http' not in s)
    src_urls = _numbered(s for s in setSrc if 'http' in s)

    src_files = 'Files:\n' + src_files if src_files else ''
    src_urls = 'URLs:\n' + src_urls if src_urls else ''
    newLineSep = '\n\n' if src_files and src_urls else ''

    return src_files + newLineSep + src_urls, len(setSrc)
225
+
226
+
227
def getVsDict(embeddingFunc, docs, vsDict=None):
    """Put *docs* into the Chroma client held in *vsDict*, creating it if needed.

    Args:
        embeddingFunc: Embedding function handed to ``Chroma``.
        docs: Documents to store.
        vsDict: Dict holding ``'chromaClient'`` and ``'chromaDir'``. When
            omitted a fresh dict is created for this call, so separate calls
            never share (and clear) the same client by accident.

    Returns:
        *vsDict* (mutated in place when supplied), whose client now contains
        only *docs*.
    """
    if vsDict is None:
        vsDict = {}

    # create chroma client if doesnt exist
    client = vsDict.get('chromaClient')
    if client is None:
        vsDict['chromaDir'] = './vecstore/' + str(uuid.uuid1())
        client = Chroma(embedding_function=embeddingFunc, persist_directory=vsDict['chromaDir'])
        vsDict['chromaClient'] = client

    # clear chroma client before adding new docs
    if client._collection.count() > 0:
        client.delete(client.get()['ids'])

    # add new docs to chroma client
    client.add_documents(docs)
    print('vectorstore count:', client._collection.count(), 'at', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    return vsDict
239
+
240
+ # used for Hardcoded documents only - not uploaded by user (userData_vecStore is separate function)
241
def localData_vecStore(openApiKey=None, inputDir=None, file_list=[], url_list=[], vsDict=None):
    """Build a vector store from hardcoded (not user-uploaded) data.

    Args:
        openApiKey: OpenAI API key used for the embeddings.
        inputDir: Optional directory of files to ingest.
        file_list: Optional file paths to ingest (read only, never mutated).
        url_list: Optional URLs to ingest (read only, never mutated).
        vsDict: Existing vector-store dict to reuse. When omitted a new dict
            is created for this call rather than sharing one default dict
            across calls.

    Returns:
        The vector-store dict returned by ``getVsDict``, or ``{}`` when no
        documents could be loaded.
    """
    # Create the dict here so the callee always receives a real, private dict.
    if vsDict is None:
        vsDict = {}

    documents = data_ingestion(inputDir, file_list, url_list)
    if not documents:
        return {}
    docs = split_docs(documents)
    # Embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=openApiKey)
    # create chroma client if doesnt exist
    vsDict_hd = getVsDict(embeddings, docs, vsDict)
    # get sources from metadata
    src_text, src_count = getSourcesFromMetadata(vsDict_hd['chromaClient'].get()['metadatas'])
    print(str(src_count) + ' source document(s) successfully loaded in vector store.' + '\n\n' + src_text)
    return vsDict_hd
255
+
256
+
257
def num_tokens_from_string(string, encoding_name = "cl100k_base"):
    """Count the tokens *string* encodes to under the tiktoken encoding *encoding_name*."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))