|
from pprint import pprint |
|
from PyPDF2 import PdfReader |
|
import gradio as gr |
|
from transformers import pipeline |
|
import os |
|
|
|
|
|
def read_pdf(pdf_path):
    """Return the text of every page in a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        One string holding each page's extracted text back to back, with no
        separator inserted between pages. Empty if the PDF has no pages.
    """
    reader = PdfReader(pdf_path)
    # Join once instead of growing a string with ``+=`` for every page.
    return "".join(page.extract_text() for page in reader.pages)
|
|
|
|
|
def _parse_questions(questions):
    """Split a comma-separated string into stripped, non-empty questions."""
    return [part.strip() for part in (questions or "").split(",") if part.strip()]


def process_invoice(file, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    Args:
        file: The uploaded PDF, either an object whose ``name`` attribute is
            the file path or the path itself.
        questions: Questions separated by commas. Surrounding whitespace is
            ignored and empty entries are skipped.

    Returns:
        A dict mapping each stripped question to the ``answer`` field the
        question-answering pipeline returned for it. On any failure, a dict
        of the form ``{"error": "<message>"}`` is returned instead; no
        exception propagates to the caller.
    """
    try:
        # Validate the cheap inputs first so a bad request fails before any
        # PDF parsing or model loading happens.
        if file is None:
            raise ValueError("No PDF file was provided.")
        question_list = _parse_questions(questions)
        if not question_list:
            raise ValueError("No questions were provided.")

        print("Reading PDF content...")
        # Accept both an upload object exposing ``.name`` and a plain path.
        pdf_content = read_pdf(getattr(file, "name", file))
        print(f"PDF Content: {pdf_content[:500]}...")

        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("Hugging Face token not found in environment variables.")

        print("Initializing the Hugging Face pipeline...")
        # NOTE: the pipeline is constructed on every call to this function.
        # NOTE(review): newer transformers releases appear to prefer ``token=``
        # over ``use_auth_token=``, and this checkpoint looks like an
        # instruction-tuned generative model rather than an extractive QA
        # model -- confirm both against the installed version / model card.
        qa_pipeline = pipeline(
            "question-answering",
            model="mistralai/Mistral-7B-Instruct-v0.2",
            use_auth_token=hf_token,
        )

        answers = {}
        for question in question_list:
            print(f"Asking question: {question}")
            result = qa_pipeline(question=question, context=pdf_content)
            # Key by the stripped question so keys match what was asked.
            answers[question] = result["answer"]
            print(f"Answer: {result['answer']}")

        return answers
    except Exception as e:
        # UI boundary: report the failure as data instead of raising.
        print(f"Error: {e}")
        return {"error": str(e)}
|
|
|
|
|
def gradio_interface(file, questions):
    """Gradio callback: hand the upload and question text to ``process_invoice``.

    Returns whatever ``process_invoice`` returns, unchanged.
    """
    return process_invoice(file, questions)
|
|
|
# Web UI definition: a PDF upload and a free-text question box in, JSON out.
interface = gr.Interface(
    title="Invoice Data Extraction",
    description="Upload an invoice PDF and get the extracted data based on your questions.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload Invoice (PDF)", file_count="single"),
        gr.Textbox(placeholder="Enter your questions separated by commas", lines=5),
    ],
    outputs="json",
)


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|
|