# Gradio app: chat with uploaded PDF documents through a LangChain retrieval chain backed by Chroma and Hugging Face models.
import os
import re
from pathlib import Path

import chromadb
import gradio as gr
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import Chroma
from unidecode import unidecode

# Hugging Face Hub repository ids ("<organisation>/<model>") of the selectable LLMs.
list_llm = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "mistralai/Mistral-7B-Instruct-v0.1",
    "google/gemma-7b-it",
    "google/gemma-2b-it",
    "HuggingFaceH4/zephyr-7b-beta",
    "tiiuae/falcon-7b-instruct",
    "google/flan-t5-xxl",
]
# Short labels in the same order: the last path component of each repository id.
list_llm_simple = list(map(os.path.basename, list_llm))


def load_doc_and_create_splits(list_file_path, chunk_size, chunk_overlap):
    """Load every PDF in ``list_file_path`` and split the loaded pages into chunks.

    Args:
        list_file_path: Iterable of PDF file paths, each handed to ``PyPDFLoader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``split_documents`` applied to the pages of all files,
        gathered in the order the files were given.
    """
    # Create all loaders first, then collect their pages file by file.
    pdf_loaders = list(map(PyPDFLoader, list_file_path))
    all_pages = [page for pdf_loader in pdf_loaders for page in pdf_loader.load()]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(all_pages)


def create_vector_db(splits, collection_name):
    """Build a Chroma vector store from ``splits`` under ``collection_name``.

    Args:
        splits: Document chunks passed to ``Chroma.from_documents`` as ``documents``.
        collection_name: Name of the Chroma collection to use.

    Returns:
        The ``Chroma`` vector store returned by ``Chroma.from_documents``.
    """
    # Embeddings use HuggingFaceEmbeddings constructed with no arguments.
    embedding_model = HuggingFaceEmbeddings()
    # NOTE(review): a fresh EphemeralClient is created on every call; confirm
    # whether such clients share state within one process, in which case reusing
    # a collection name would add to the existing collection.
    ephemeral_client = chromadb.EphemeralClient()
    return Chroma.from_documents(
        documents=splits,
        embedding=embedding_model,
        client=ephemeral_client,
        collection_name=collection_name,
    )


def initialize_llmchain(
    llm_model, temperature, max_tokens, top_k, vector_db, progress=gr.Progress()
):
    """Create a conversational retrieval chain over ``vector_db``.

    Args:
        llm_model: Hugging Face repository id passed to ``HuggingFaceEndpoint``
            as ``repo_id``.
        temperature: Passed to the endpoint as ``temperature``.
        max_tokens: Passed to the endpoint as ``max_new_tokens``.
        top_k: Passed to the endpoint as ``top_k``.
        vector_db: Vector store; its ``as_retriever()`` supplies the retriever.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        The ``ConversationalRetrievalChain`` built with buffer memory and
        ``return_source_documents=True``.
    """
    progress(0.1, desc="Initializing HF Hub...")
    endpoint_options = {
        "repo_id": llm_model,
        "temperature": temperature,
        "max_new_tokens": max_tokens,
        "top_k": top_k,
    }
    # Only the Mixtral model gets the extra load_in_8bit flag.
    if llm_model == "mistralai/Mixtral-8x7B-Instruct-v0.1":
        endpoint_options["load_in_8bit"] = True
    llm = HuggingFaceEndpoint(**endpoint_options)

    progress(0.6, desc="Defining buffer memory...")
    # output_key="answer" tells the memory which of the chain's outputs to
    # record, since the chain also returns the source documents.
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history", output_key="answer", return_messages=True
    )
    # Retriever with the vector store's default search settings.
    doc_retriever = vector_db.as_retriever()

    progress(0.75, desc="Defining retrieval chain...")
    chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=doc_retriever,
        chain_type="stuff",
        memory=chat_memory,
        return_source_documents=True,
        verbose=False,
    )

    progress(0.9, desc="Done!")
    return chain


def create_collection_name(filepath):
    """Derive a vector-database collection name from ``filepath``.

    The file stem is transliterated to ASCII, every run of characters other
    than letters and digits becomes a single ``-``, and the result is cut to
    50 characters. Names shorter than 3 characters get ``"xyz"`` appended, and
    a non-alphanumeric first or last character is replaced by ``"A"`` or
    ``"Z"`` respectively.

    Args:
        filepath: Path of the file the collection is named after.

    Returns:
        The sanitised collection name. The file path and the resulting name
        are also printed.
    """
    stem = Path(filepath).stem  # filename without its final extension
    ascii_stem = unidecode(stem.replace(" ", "-"))
    name = re.sub("[^A-Za-z0-9]+", "-", ascii_stem)[:50]

    # Pad very short names so at least three characters are present.
    if len(name) < 3:
        name += "xyz"

    # From here the name has three or more characters, so the first and last
    # characters can be fixed independently of each other.
    first = name[0] if name[0].isalnum() else "A"
    last = name[-1] if name[-1].isalnum() else "Z"
    name = first + name[1:-1] + last

    print("Filepath: ", filepath)
    print("Collection name: ", name)
    return name


def initialize_database(
    list_file_obj, chunk_size, chunk_overlap, progress=gr.Progress()
):
    """Build the vector database from the uploaded files.

    Args:
        list_file_obj: Uploaded file objects exposing a ``name`` attribute
            (the file path); ``None`` entries are skipped. ``None`` or an empty
            list means nothing was uploaded.
        chunk_size: Chunk size forwarded to ``load_doc_and_create_splits``.
        chunk_overlap: Chunk overlap forwarded to ``load_doc_and_create_splits``.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        Tuple ``(vector_db, collection_name, "Complete!")``.

    Raises:
        gr.Error: If no file was provided, so the UI shows a readable message
            instead of failing on the first-file lookup.
    """
    list_file_path = [x.name for x in (list_file_obj or []) if x is not None]
    if not list_file_path:
        raise gr.Error("Please upload at least one PDF document first.")

    progress(0.1, desc="Creating collection name...")
    # The collection is named after the first uploaded file only.
    collection_name = create_collection_name(list_file_path[0])

    progress(0.25, desc="Loading document...")
    doc_splits = load_doc_and_create_splits(list_file_path, chunk_size, chunk_overlap)

    progress(0.5, desc="Generating vector database...")
    vector_db = create_vector_db(doc_splits, collection_name)

    progress(0.9, desc="Done!")
    return vector_db, collection_name, "Complete!"


def initialize_LLM(
    llm_option, llm_temperature, max_tokens, top_k, vector_db, progress=gr.Progress()
):
    """Resolve the selected model and build its question-answering chain.

    Args:
        llm_option: Index into ``list_llm`` of the chosen model.
        llm_temperature: Temperature forwarded to ``initialize_llmchain``.
        max_tokens: Token limit forwarded to ``initialize_llmchain``.
        top_k: Top-k value forwarded to ``initialize_llmchain``.
        vector_db: Vector store forwarded to ``initialize_llmchain``.
        progress: Gradio progress tracker forwarded to ``initialize_llmchain``.

    Returns:
        Tuple ``(qa_chain, "Complete!")``.
    """
    selected_model = list_llm[llm_option]
    print("llm_name: ", selected_model)
    chain = initialize_llmchain(
        llm_model=selected_model,
        temperature=llm_temperature,
        max_tokens=max_tokens,
        top_k=top_k,
        vector_db=vector_db,
        progress=progress,
    )
    return chain, "Complete!"


def format_chat_history(message, chat_history):
    """Flatten ``chat_history`` into alternating "User"/"Assistant" lines.

    Args:
        message: Current user message; accepted but not used here.
        chat_history: Iterable of ``(user_message, bot_message)`` pairs.

    Returns:
        A list with two strings per pair, ``"User: ..."`` followed by
        ``"Assistant: ..."``, in conversation order.
    """
    return [
        line
        for user_message, bot_message in chat_history
        for line in (f"User: {user_message}", f"Assistant: {bot_message}")
    ]


def conversation(qa_chain, message, history):
    """Answer ``message`` with the QA chain and update chat and reference fields.

    Args:
        qa_chain: Callable chain invoked with a dict holding ``question`` and
            ``chat_history``; its result must provide ``answer`` and
            ``source_documents``.
        message: The user's question.
        history: List of ``(user_message, bot_message)`` pairs shown so far;
            ``None`` is treated as an empty history.

    Returns:
        A 9-tuple: the chain, a ``gr.update`` that clears the message box, the
        extended history, and then text and page number for each of three
        references. When fewer than three source documents come back, the
        remaining references are ``""`` with page ``0``.

    Raises:
        gr.Error: If ``qa_chain`` is ``None`` (the chain was not initialised).
    """
    if qa_chain is None:
        raise gr.Error("Please initialize the question answering chain first.")
    history = history or []
    formatted_chat_history = format_chat_history(message, history)

    # Generate response using QA chain
    response = qa_chain({"question": message, "chat_history": formatted_chat_history})
    response_answer = response["answer"]
    # Keep only the text after the last "Helpful Answer:" marker, if present.
    if "Helpful Answer:" in response_answer:
        response_answer = response_answer.split("Helpful Answer:")[-1]

    # The UI has exactly three reference slots. Take at most three documents
    # and pad with blanks so a short result cannot raise IndexError.
    # Page metadata is treated as zero-based, hence the + 1 for display.
    references = [
        (doc.page_content.strip(), doc.metadata["page"] + 1)
        for doc in response["source_documents"][:3]
    ]
    references += [("", 0)] * (3 - len(references))
    (source1, page1), (source2, page2), (source3, page3) = references

    # Append user message and response to chat history
    new_history = history + [(message, response_answer)]
    return (
        qa_chain,
        gr.update(value=""),
        new_history,
        source1,
        page1,
        source2,
        page2,
        source3,
        page3,
    )


def upload_file(file_obj):
    """Return the path of every uploaded file.

    Args:
        file_obj: Iterable of uploaded file objects, each exposing its path as
            ``name``. ``None`` or an empty iterable yields an empty list.

    Returns:
        List of the ``name`` attributes, in upload order.
    """
    if not file_obj:
        return []
    # Read the name from each element; the original read it from the
    # container itself, which has no ``name`` attribute.
    return [file.name for file in file_obj]


def demo():
    """Build the "Chat with your PDF" Gradio interface and launch it.

    The interface has two tabs:

    * "Chatbot configuration": upload PDFs, build the vector database
      (``initialize_database``) and initialise the QA chain (``initialize_LLM``).
    * "Chatbot": chat with the documents (``conversation``) and inspect up to
      three document references with their page numbers.

    The vector database, QA chain and collection name are kept in ``gr.State``
    components. The call ends by queuing and launching the app with
    ``debug=True``.
    """
    with gr.Blocks(theme="base") as app:
        vector_db = gr.State()
        qa_chain = gr.State()
        collection_name = gr.State()

        # Header markup: tags are properly nested and closed.
        gr.Markdown(
            """
            <center><h1>Chat with your PDF</h1></center>
            <center><h3>Ask any questions about your PDF documents</h3></center>
            """
        )

        with gr.Tab("Chatbot configuration"):
            gr.Markdown("1. Upload the PDF(s)")
            with gr.Row():
                document = gr.Files(
                    height=100,
                    file_count="multiple",
                    # Extensions are written with a leading dot.
                    file_types=[".pdf"],
                    interactive=True,
                    label="Upload your PDF documents (single or multiple)",
                )

            gr.Markdown("2. Configure the vector database")
            with gr.Row():
                with gr.Row():
                    # Display-only selector: no event reads it, so it is not
                    # bound to a name (it used to be shadowed by the button).
                    gr.Radio(
                        ["ChromaDB"],
                        label="Vector database type",
                        value="ChromaDB",
                        type="index",
                        info="Choose your vector database",
                    )
                with gr.Accordion(
                    "Advanced options - Document text splitter", open=False
                ):
                    with gr.Row():
                        slider_chunk_size = gr.Slider(
                            minimum=100,
                            maximum=1000,
                            value=600,
                            step=20,
                            label="Chunk size",
                            info="Chunk size",
                            interactive=True,
                        )
                    with gr.Row():
                        slider_chunk_overlap = gr.Slider(
                            minimum=10,
                            maximum=200,
                            value=40,
                            step=10,
                            label="Chunk overlap",
                            info="Chunk overlap",
                            interactive=True,
                        )
            with gr.Row():
                db_btn = gr.Button("Generate vector database", size="sm")
            with gr.Row():
                db_progress = gr.Textbox(
                    label="Vector database initialization", value="0% Configure the DB"
                )

            gr.Markdown("3. Configure the LLM model")
            with gr.Row():
                with gr.Row():
                    # type="index" makes the component pass the selected
                    # position, which initialize_LLM uses to index list_llm.
                    llm_choice = gr.Radio(
                        list_llm_simple,
                        label="LLM models",
                        value=list_llm_simple[0],
                        type="index",
                        info="Choose your LLM model",
                    )
                with gr.Accordion("Advanced options - LLM model", open=False):
                    with gr.Row():
                        slider_temperature = gr.Slider(
                            minimum=0.01,
                            maximum=1.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature",
                            info="Model temperature",
                            interactive=True,
                        )
                    with gr.Row():
                        slider_maxtokens = gr.Slider(
                            minimum=224,
                            maximum=4096,
                            value=1024,
                            step=32,
                            label="Max Tokens",
                            info="Model max tokens",
                            interactive=True,
                        )
                    with gr.Row():
                        slider_topk = gr.Slider(
                            minimum=1,
                            maximum=10,
                            value=3,
                            step=1,
                            label="top-k samples",
                            info="Model top-k samples",
                            interactive=True,
                        )
            with gr.Row():
                qachain_btn = gr.Button(
                    "Initialize Question Answering chain", size="sm"
                )
            with gr.Row():
                llm_progress = gr.Textbox(
                    label="QA chain initialization", value="0% Configure the QA chain"
                )

        with gr.Tab("Chatbot"):
            chatbot = gr.Chatbot(height=300)
            with gr.Accordion("Advanced - Document references", open=False):
                with gr.Row():
                    doc_source1 = gr.Textbox(
                        label="Reference 1", lines=2, container=True, scale=20
                    )
                    source1_page = gr.Number(label="Page", scale=1)
                with gr.Row():
                    doc_source2 = gr.Textbox(
                        label="Reference 2", lines=2, container=True, scale=20
                    )
                    source2_page = gr.Number(label="Page", scale=1)
                with gr.Row():
                    doc_source3 = gr.Textbox(
                        label="Reference 3", lines=2, container=True, scale=20
                    )
                    source3_page = gr.Number(label="Page", scale=1)
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type message (e.g. 'What is this document about?')",
                    container=True,
                )
            with gr.Row():
                submit_btn = gr.Button("Submit message")
                clear_btn = gr.ClearButton([msg, chatbot], value="Clear conversation")

        # Component groups shared by several events.
        reference_outputs = [
            doc_source1,
            source1_page,
            doc_source2,
            source2_page,
            doc_source3,
            source3_page,
        ]
        reset_outputs = [chatbot, *reference_outputs]
        chat_outputs = [qa_chain, msg, chatbot, *reference_outputs]

        def reset_chat():
            """Return blank values for the chatbot and the three reference rows."""
            return [None, "", 0, "", 0, "", 0]

        # Preprocessing events
        db_btn.click(
            initialize_database,
            inputs=[document, slider_chunk_size, slider_chunk_overlap],
            outputs=[vector_db, collection_name, db_progress],
        )
        # Building a new chain also clears the previous conversation.
        qachain_btn.click(
            initialize_LLM,
            inputs=[
                llm_choice,
                slider_temperature,
                slider_maxtokens,
                slider_topk,
                vector_db,
            ],
            outputs=[qa_chain, llm_progress],
        ).then(
            reset_chat,
            inputs=None,
            outputs=reset_outputs,
            queue=False,
        )

        # Chatbot events: pressing Enter and clicking the button do the same.
        msg.submit(
            conversation,
            inputs=[qa_chain, msg, chatbot],
            outputs=chat_outputs,
            queue=False,
        )
        submit_btn.click(
            conversation,
            inputs=[qa_chain, msg, chatbot],
            outputs=chat_outputs,
            queue=False,
        )
        clear_btn.click(
            reset_chat,
            inputs=None,
            outputs=reset_outputs,
            queue=False,
        )
    app.queue().launch(debug=True)


# Build and launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    demo()