File size: 3,134 Bytes
b3a99ca
 
988c7cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e857a97
988c7cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87908a4
988c7cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6863a01
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import google.generativeai as palm
import pandas as pd

import io
from flask import Flask, request
from twilio.twiml.messaging_response import MessagingResponse
from langchain.llms import GooglePalm
import pandas as pd
#from yolopandas import pd
import os
from langchain.embeddings import GooglePalmEmbeddings
# a class to create a question answering system based on information retrieval
from langchain.chains import RetrievalQA
# a class for splitting text into fixed-sized chunks with an optional overlay
from langchain.text_splitter import RecursiveCharacterTextSplitter
# a class to create a vector index using FAISS, a library for approximate nearest neighbor search
from langchain.vectorstores import FAISS
# a class for loading PDF documents from a directory
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain
from langchain.schema.vectorstore import VectorStoreRetriever
import google.generativeai

from dotenv import load_dotenv

load_dotenv()




def get_pdf_text(pdf_docs):
    """Return the text of every page of every PDF in *pdf_docs*, concatenated.

    Each item of ``pdf_docs`` is passed to ``PdfReader``; the result of
    ``extract_text()`` for each page is appended in document order, then page
    order, with no separator between pages or documents. An empty
    ``pdf_docs`` yields an empty string.

    NOTE(review): ``PdfReader`` is not imported in the visible part of this
    file, so a call with a non-empty ``pdf_docs`` looks like it would raise
    ``NameError`` -- confirm, and add the import from the PDF library the
    project actually uses.
    """
    page_texts = []
    for document in pdf_docs:
        reader = PdfReader(document)
        page_texts.extend(page.extract_text() for page in reader.pages)
    return "".join(page_texts)

# --- Index construction: everything below runs once, when the module is
# --- imported, and its results are kept in module-level variables.

# Load the PDF files found in the "documents/" directory.
loader = PyPDFDirectoryLoader("documents/")
data = loader.load()

# `data` is handed to `split_documents` below, i.e. it is a collection of
# loaded documents rather than raw strings.
# NOTE(review): presumably one LangChain Document per PDF page -- confirm
# against the loader's documentation.

# Split the loaded documents into chunks of at most 10000 characters, with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=20)

text_chunks = text_splitter.split_documents(data)

# Embedding model used to vectorise the chunks. The API key is read from the
# PALM environment variable; indexing os.environ raises KeyError at import
# time if that variable is not set.
embeddings = GooglePalmEmbeddings(google_api_key=os.environ['PALM'])

# Embed every chunk and build a FAISS vector index over the result. This
# index is the `vector_store` that `ask_pdfs` wraps in a retriever.
vector_store = FAISS.from_documents(text_chunks, embedding=embeddings)




# Question-answering chain shared by every call to `ask_pdfs`; built lazily on
# first use so that importing the module does not construct the LLM client.
_qa_chain = None


def _get_qa_chain():
    """Return the shared RetrievalQA chain, constructing it on first use."""
    global _qa_chain
    if _qa_chain is None:
        # temperature=0 asks the model for its least random output.
        llm = GooglePalm(temperature=0, google_api_key=os.environ['PALM'])
        # The retriever looks up chunks in the module-level FAISS index.
        retriever = VectorStoreRetriever(vectorstore=vector_store)
        _qa_chain = RetrievalQA.from_llm(llm=llm, retriever=retriever)
    return _qa_chain


def ask_pdfs(user_question):
    """Answer *user_question* using the indexed PDF documents.

    The question is run through a RetrievalQA chain that combines the
    module-level ``vector_store`` (as retriever) with a Google PaLM LLM.

    The chain does not depend on the question, so it is built once and reused
    instead of being reconstructed on every call. The environment is already
    loaded by the module-level ``load_dotenv()`` call, so it is not reloaded
    here.

    Args:
        user_question: The question text to answer.

    Returns:
        The value returned by the chain's ``run`` method for the question.

    Raises:
        KeyError: On first use, if the ``PALM`` environment variable is unset.
    """
    return _get_qa_chain().run(user_question)

app = Flask(__name__)


@app.route("/", methods=["POST"])
def whatsapp():
    """Handle an incoming Twilio WhatsApp webhook and reply with an answer.

    Reads the message text from the ``Body`` request value (empty string if
    absent), lower-cases it, passes it to ``ask_pdfs`` and wraps the answer in
    a TwiML ``<Response><Message>`` document.

    Returns:
        str: The TwiML XML document containing the answer as a message.
    """
    # User input as posted by Twilio; lower-cased as in the original handler.
    user_msg = request.values.get('Body', '').lower()

    answer = ask_pdfs(user_msg)

    # Put the answer inside the TwiML reply. The MessagingResponse object used
    # to be overwritten by the raw answer string, so the endpoint returned
    # plain text instead of a TwiML document.
    twiml = MessagingResponse()
    twiml.message(answer)

    return str(twiml)


# Start Flask's built-in server only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # host 0.0.0.0 listens on all network interfaces; the port is fixed at 7860.
    app.run(host="0.0.0.0", port=7860)