|
import PyPDF2 |
|
from pprint import pprint |
|
from haystack import Pipeline |
|
from haystack.schema import Document |
|
from haystack.nodes import BM25Retriever |
|
from haystack.document_stores import InMemoryDocumentStore |
|
from haystack.nodes import PreProcessor, PromptTemplate, PromptNode |
|
from pdf2image import convert_from_path |
|
import pytesseract |
|
from PIL import Image |
|
import gradio as gr |
|
import os |
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """OCR every page of a PDF and return the concatenated text.

    The PDF is rasterized page-by-page with pdf2image, then each page
    image is run through Tesseract OCR via pytesseract.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        A single string containing the OCR output of all pages, in
        page order.
    """
    # Rasterize all pages up front; each entry is a PIL Image.
    images = convert_from_path(pdf_path)
    # join() instead of repeated `+=`: avoids quadratic string
    # concatenation on multi-page documents.
    return "".join(pytesseract.image_to_string(image) for image in images)
|
|
|
|
|
def process_invoice(pdf, hf_token, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    Builds a small RAG pipeline: OCR the PDF, split the text into
    passages, index them in an in-memory BM25 store, and query a hosted
    Mixtral model through Haystack's PromptNode once per question.

    Args:
        pdf: Uploaded file object exposing a ``.name`` filesystem path,
            or a plain path string.
        hf_token: Hugging Face API token used to call the hosted model.
        questions: Comma-separated question string.

    Returns:
        Dict mapping each stripped question to the model's answer
        (also stripped). Empty entries between commas are skipped.
    """
    # Accept both a Gradio file object (has .name) and a bare path.
    pdf_path = pdf if isinstance(pdf, str) else pdf.name
    extracted_text = extract_text_from_pdf(pdf_path)
    docs = [Document(content=extracted_text)]

    # Split the OCR text into ~500-word passages so BM25 retrieval
    # returns focused chunks instead of the whole invoice.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=500,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    preprocessed_docs = processor.process(docs)

    document_store = InMemoryDocumentStore(use_bm25=True)
    document_store.write_documents(preprocessed_docs)
    retriever = BM25Retriever(document_store, top_k=2)

    # Fixed: the original prompt text was garbled (misplaced clauses and
    # an unterminated quotation); rephrased into coherent instructions
    # with the same intent. {join(documents)} / {query} are Haystack
    # PromptTemplate placeholders.
    qa_template = PromptTemplate(prompt=
        """Using exclusively the information contained in the context, answer only the question asked,
        without adding suggestions for possible questions, and respond exclusively in English.
        Do not add anything from the references unless it is explicitly asked for, and do not repeat
        the same information twice. If the answer cannot be deduced from the context, respond:
        "Not sure because not relevant to the context."
        Context: {join(documents)};
        Question: {query}
        """)

    prompt_node = PromptNode(
        model_name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1',
        api_key=hf_token,
        default_prompt_template=qa_template,
        max_length=500,
        model_kwargs={"model_max_length": 5000}
    )

    # Query -> BM25 retriever -> prompt node (generator).
    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
    rag_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

    answers = {}
    for question in questions.split(','):
        question = question.strip()
        if not question:
            # Skip empty entries produced by trailing or doubled commas.
            continue
        result = rag_pipeline.run(query=question)
        # Use the stripped question as the key so it matches the query
        # actually sent to the pipeline (the original kept raw whitespace).
        answers[question] = result["results"][0].strip()

    return answers
|
|
|
|
|
def gradio_interface(pdf, hf_token, questions):
    """Gradio callback: delegate straight to process_invoice and return its answers dict."""
    return process_invoice(pdf, hf_token, questions)
|
|
|
# NOTE(review): the original used the `gr.inputs.*` namespace, which was
# deprecated in Gradio 3 and removed in Gradio 4; the top-level component
# classes are the supported equivalents. `type="file"` is kept so the
# callback receives a tempfile object with a `.name` path, matching what
# process_invoice expects — confirm against the installed Gradio version.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="single", type="file", label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF, provide your Hugging Face token, and get the extracted data based on your questions."
)

if __name__ == "__main__":
    # Launch the web UI only when run as a script, not on import.
    interface.launch()
|
|