barghavani committed
Commit 80a01f0 · verified · 1 Parent(s): 54ebddd

Update app.py

Files changed (1)
  1. app.py +25 -72
app.py CHANGED
@@ -1,78 +1,31 @@
- import streamlit as st
- import os
- from PyPDF2 import PdfReader
- from langchain.text_splitter import CharacterTextSplitter
- from langchain.embeddings.openai import OpenAIEmbeddings
- from langchain.vectorstores import FAISS
- from langchain.chains.question_answering import load_qa_chain
- from langchain.callbacks import get_openai_callback
- from langchain import HuggingFaceHub, LLMChain
- from langchain.embeddings import HuggingFaceHubEmbeddings, HuggingFaceInferenceAPIEmbeddings
- token = os.environ['HF_TOKEN']
- repo_id = "sentence-transformers/all-mpnet-base-v2"
- hf = HuggingFaceHubEmbeddings(
-     repo_id=repo_id,
-     task="feature-extraction",
-     huggingfacehub_api_token=token,
- )

- from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

- embeddings = HuggingFaceInferenceAPIEmbeddings(
-     api_key=token, model_name="sentence-transformers/all-MiniLM-l6-v2"
- )


- def main():
-
-     st.set_page_config(page_title="Ask your PDF")
-     st.header("Ask your PDF 💬")
-
-     # upload file
-     pdf = st.file_uploader("Upload your PDF", type="pdf")
-
-     # extract the text
-     if pdf is not None:
-         pdf_reader = PdfReader(pdf)
-         text = ""
-         for page in pdf_reader.pages:
-             text += page.extract_text()
-
-         # split into chunks
-         text_splitter = CharacterTextSplitter(
-             separator="\n",
-             chunk_size=1000,
-             chunk_overlap=200,
-             length_function=len
-         )
-         chunks = text_splitter.split_text(text)
-
-         # create embeddings
-         # embeddings = OpenAIEmbeddings()
-         # embeddings = query(chunks)
-         # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
-
-         knowledge_base = FAISS.from_texts(chunks, embeddings)
-
-         # show user input
-         user_question = st.text_input("Ask a question about your PDF:")
-         if user_question:
-             docs = knowledge_base.similarity_search(user_question)
-
-             # llm = OpenAI()

-             hub_llm = HuggingFaceHub(
-                 repo_id='HuggingFaceH4/zephyr-7b-beta',
-                 model_kwargs={'temperature': 0.01, "max_length": 2048,},
-                 huggingfacehub_api_token=token)
-             llm = hub_llm
-             chain = load_qa_chain(llm, chain_type="stuff")
-             with get_openai_callback() as cb:
-                 response = chain.run(input_documents=docs, question=user_question)
-                 print(cb)
-
-             st.write(response)
-

- if __name__ == '__main__':
-     main()
+ from pathlib import Path
+ from typing import Union

+ from pypdf import PdfReader
+ from transformers import pipeline
+ import gradio as gr

+ question_answerer = pipeline(task="question-answering", model="deepset/tinyroberta-squad2")


+ def get_text_from_pdf(pdf_file: Union[str, Path]) -> str:
+     """Read the PDF from the given path and return a string with its entire content."""
+     reader = PdfReader(pdf_file)
+
+     # Extract text from all pages
+     full_text = ""
+     for page in reader.pages:
+         full_text += page.extract_text()
+     return full_text
+
+
+ def answer_doc_question(pdf_file, question):
+     pdf_text = get_text_from_pdf(pdf_file)
+     answer = question_answerer(question, pdf_text)
+     return answer["answer"]
+
+
+ pdf_input = gr.File(file_types=[".pdf"], label="Upload a PDF document and ask a question about it.")
+ question = gr.Textbox(label="Type a question regarding the uploaded document here.")
+ gr.Interface(fn=answer_doc_question, inputs=[pdf_input, question], outputs="text").launch()