Spaces:

muradkhan
/

indemo

Paused

App Files Files Community

muradkhan committed on Jul 24, 2024

Commit

641b252

verified ·

1 Parent(s): 6b192a8

Create app.py

Browse files

Files changed (1) hide show

app.py +93 -0

app.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import PyPDF2
+from pprint import pprint
+from haystack import Pipeline
+from haystack.schema import Document
+from haystack.nodes import BM25Retriever
+from haystack.document_stores import InMemoryDocumentStore
+from haystack.nodes import PreProcessor, PromptTemplate, PromptNode
+from pdf2image import convert_from_path
+import pytesseract
+from PIL import Image
+import gradio as gr
+import os
+# Function to extract text from a PDF file using OCR
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    # Convert PDF pages to images
+    images = convert_from_path(pdf_path)
+    for image in images:
+        # Perform OCR on the image
+        text += pytesseract.image_to_string(image)
+    return text
+# Process and retrieve answers
+def process_invoice(pdf, hf_token, questions):
+    # Extract text from the PDF
+    extracted_text = extract_text_from_pdf(pdf.name)
+    document = Document(content=extracted_text)
+    docs = [document]
+    # Initializing the processor
+    processor = PreProcessor(
+        clean_empty_lines=True,
+        clean_whitespace=True,
+        clean_header_footer=True,
+        split_by="word",
+        split_length=500,
+        split_respect_sentence_boundary=True,
+        split_overlap=0,
+    )
+    preprocessed_docs = processor.process(docs)
+    document_store = InMemoryDocumentStore(use_bm25=True)
+    document_store.write_documents(preprocessed_docs)
+    retriever = BM25Retriever(document_store, top_k=2)
+    qa_template = PromptTemplate(prompt=
+        """ Using exclusively the information contained in the context, answer only the question asked without adding
+        suggestions for possible questions, and respond exclusively in English. If the answer cannot be deduced from the
+        context, Don't add anything from the references if it is not asked explicitly. Do not repeat the same information twice
+        respond: "Not sure because not relevant to the context.
+        Context: {join(documents)};
+        Question: {query}
+        """)
+    prompt_node = PromptNode(
+        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
+        api_key=hf_token,
+        default_prompt_template=qa_template,
+        max_length=500,
+        model_kwargs={"model_max_length": 5000}
+    )
+    rag_pipeline = Pipeline()
+    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
+    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])
+    answers = {}
+    for question in questions.split(','):
+        result = rag_pipeline.run(query=question.strip())
+        answers[question] = result["results"][0].strip()
+    return answers
+# Gradio interface
+def gradio_interface(pdf, hf_token, questions):
+    answers = process_invoice(pdf, hf_token, questions)
+    return answers
+interface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[
+        gr.inputs.File(file_count="single", type="file", label="Upload Invoice (PDF)"),
+        gr.inputs.Textbox(type="password", label="Enter your Hugging Face Token"),
+        gr.inputs.Textbox(lines=5, placeholder="Enter your questions separated by commas")
+    ],
+    outputs="json",
+    title="Invoice Data Extraction",
+    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions."
+)
+if __name__ == "__main__":
+    interface.launch()