NicolasGaudemet committed on
Commit
3fd3fa4
·
1 Parent(s): dc9ded5

Update document_questioner_app.py

Browse files
Files changed (1) hide show
  1. document_questioner_app.py +73 -31
document_questioner_app.py CHANGED
@@ -6,11 +6,12 @@ from langchain.document_loaders import PyPDFLoader
6
  from langchain.embeddings.openai import OpenAIEmbeddings
7
  from langchain.vectorstores import Chroma
8
  from langchain.indexes import VectorstoreIndexCreator
9
- from langchain.chains import RetrievalQAWithSourcesChain
10
  from langchain.prompts import PromptTemplate
11
  from langchain.chat_models import ChatOpenAI
 
12
 
13
- def question_document(Document, Question):
14
 
15
  # loads a PDF document
16
  if not Document:
@@ -20,48 +21,89 @@ def question_document(Document, Question):
20
 
21
  loader = PyPDFLoader(Document.name)
22
  docs = loader.load()
 
 
23
 
24
  # Create embeddings
25
  embeddings = OpenAIEmbeddings(openai_api_key = os.environ['OpenaiKey'])
26
 
27
  # Write in DB
28
- docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # Define LLM
31
- llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.2, openai_api_key = os.environ['OpenaiKey'])
 
32
 
33
  # Customize map_reduce prompts
34
- question_template = """{context}
35
- Precise the number starting the above text in your answer. It corresponds to its page number in the document it is from. Label this number as "page".
36
- Also make sure to answer in the same langage than the following question.
37
- QUESTION : {question}
38
- ANSWER :
39
- """
40
 
41
- combine_template = """{summaries}
42
- Note that the above text is based on transient extracts from one source document.
43
- So make sure to not mention different documents or extracts or passages or portions or texts. There is only one, entire document.
44
- Also make sure to answer in the same langage than the following question.
45
- QUESTION : {question}.
46
- ANSWER :
47
- """
48
 
49
- question_prompt = PromptTemplate(template = question_template, input_variables=['context', 'question'])
50
- combine_prompt = PromptTemplate(template = combine_template, input_variables=['summaries', 'question'])
51
 
52
  # Define chain
53
- chain_type_kwargs = { "combine_prompt" : combine_prompt, "question_prompt" : question_prompt} #, "return_intermediate_steps" : True}
54
- qa = RetrievalQAWithSourcesChain.from_chain_type(llm = llm, chain_type = "map_reduce", chain_type_kwargs = chain_type_kwargs, retriever=docsearch.as_retriever(), return_source_documents = True)
55
 
56
- answer = qa({"question" : Question}, return_only_outputs = True)
57
- return answer["answer"]
 
 
 
 
 
 
 
58
 
59
- iface = gr.Interface(
60
- fn = question_document,
61
- inputs= ["file","text"],
62
- outputs = gr.Textbox(label="Réponse"),
63
- title="Interrogateur de PDF",
64
- description="par Nicolas \nPermet d'interroger un document PDF",
65
- allow_flagging = "never")
 
 
 
 
 
 
 
 
66
 
67
- iface.launch()
 
 
 
 
 
 
 
 
6
  from langchain.embeddings.openai import OpenAIEmbeddings
7
  from langchain.vectorstores import Chroma
8
  from langchain.indexes import VectorstoreIndexCreator
9
+ from langchain.chains import ConversationalRetrievalChain
10
  from langchain.prompts import PromptTemplate
11
  from langchain.chat_models import ChatOpenAI
12
+ from langchain.llms import OpenAI
13
 
14
+ def load_document(Document):
15
 
16
  # loads a PDF document
17
  if not Document:
 
21
 
22
  loader = PyPDFLoader(Document.name)
23
  docs = loader.load()
24
+ global k
25
+ k = len(docs)
26
 
27
  # Create embeddings
28
  embeddings = OpenAIEmbeddings(openai_api_key = os.environ['OpenaiKey'])
29
 
30
  # Write in DB
31
+ global docsearch
32
+ docsearch = Chroma.from_documents(docs, embeddings, ids=["page" + str(d.metadata["page"]) for d in docs], k=1)
33
+ global chat_history
34
+ chat_history = []
35
+
36
+ return "Endodage créé"
37
+
38
def get_chat_history(inputs) -> str:
    """Format a chat history as a plain-text transcript.

    Args:
        inputs: Iterable of ``(human, ai)`` pairs, i.e. a question and the
            answer that was given to it.

    Returns:
        One "Question : ... / Réponse : ..." entry per pair, with entries
        separated by a newline. An empty history yields an empty string.
    """
    return "\n".join(
        f"Question : {human}\nRéponse : {ai}" for human, ai in inputs
    )
43
+
44
def question_document(Question):
    """Answer a question about the encoded PDF and return the conversation.

    Relies on the module-level globals ``docsearch``, ``k`` and
    ``chat_history`` that ``load_document`` creates when a PDF is encoded.

    Args:
        Question: The user's question, as plain text.

    Returns:
        The message "Merci d'encoder un document PDF" when no document has
        been encoded yet. Otherwise the full chat history, including the new
        question and its answer, as formatted by ``get_chat_history``.

    Raises:
        KeyError: If the ``OpenaiKey`` environment variable is not set.
    """
    # Guard: nothing can be retrieved until load_document has built the store.
    if "docsearch" not in globals():
        return "Merci d'encoder un document PDF"

    # Define LLM
    turbo = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
        openai_api_key=os.environ["OpenaiKey"],
    )

    # k is set by load_document to the number of documents returned by the
    # PDF loader, so the retriever is asked for as many results as were loaded.
    search_kwargs = {"k": k}
    vectordbkwargs = {"search_distance": 10}

    # Define chain
    qa = ConversationalRetrievalChain.from_llm(
        llm=turbo,
        chain_type="map_reduce",
        retriever=docsearch.as_retriever(search_kwargs=search_kwargs),
        get_chat_history=get_chat_history,
        return_source_documents=True,
    )
    answer = qa(
        {
            "question": Question,
            "chat_history": chat_history,
            "vectordbkwargs": vectordbkwargs,
        },
        return_only_outputs=True,
    )

    # Remember this exchange so follow-up questions are asked in context.
    chat_history.append((Question, answer["answer"]))

    # Full chain output (including source documents) goes to the process log.
    print(answer)

    # get_chat_history already returns a single string.
    return get_chat_history(chat_history)
85
 
86
# Two-column UI: the left column encodes a PDF, the right column asks questions.
with gr.Blocks() as demo:

    gr.Markdown(
        """
        # Interrogateur de PDF
        par Nicolas et Alex
        """)

    with gr.Row():

        # Left column: upload a PDF and encode it with load_document.
        with gr.Column():
            # Components come from the top-level gradio namespace, like the
            # answer Textbox below, instead of the deprecated gr.inputs /
            # gr.outputs namespaces.
            input_file = gr.File(label="Charger un document")
            encode_btn = gr.Button("Encoder le document")
            output_words = gr.Textbox(label="Encodage")
            encode_btn.click(fn=load_document, inputs=input_file, outputs=output_words)

        # Right column: ask a question and display the running conversation.
        with gr.Column():
            text = gr.Textbox(label="Question")
            ask_btn = gr.Button("Poser une question")
            answer = gr.Textbox(label="Réponse", lines=8)
            ask_btn.click(fn=question_document, inputs=text, outputs=answer)


demo.launch()