accounting-micro-automation / pdfparser_hq.py
Adr740's picture
Upload 8 files
411ca77 verified
raw
history blame
3.49 kB
from openai import OpenAI
import pdf2image
import base64
import os
import time
from config import openai_api
# Module-level OpenAI client used by img_to_txt; the API key is read from the local config module.
client = OpenAI(api_key=openai_api)
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def pdf_to_image(path_to_pdf, get_output_in_code = False):
    """Render every page of a PDF to a PNG file on disk.

    Pages are written to ``<path_to_pdf without its extension>/png/<i>_page.png``
    where ``i`` is the zero-based page index. The directory is created if needed.

    Args:
        path_to_pdf: Path of the PDF file to convert.
        get_output_in_code: When true, return the rendered pages and the
            paths they were saved to; otherwise nothing is returned.

    Returns:
        ``(images, paths_to_img)`` when ``get_output_in_code`` is true, where
        ``images`` is the list returned by ``pdf2image.convert_from_path`` and
        ``paths_to_img`` lists the saved PNG paths in page order. ``None``
        otherwise.
    """
    print("Converting pdf to img")
    start_time = time.time()
    images = pdf2image.convert_from_path(path_to_pdf, dpi=100)
    execution_time = time.time() - start_time
    print("Conversion complete")
    print("Execution time: {:.2f} seconds".format(execution_time))

    # splitext strips only the trailing extension; str.replace(".pdf", "")
    # would also rewrite ".pdf" appearing inside a directory name.
    base_dir = os.path.splitext(path_to_pdf)[0]
    save_path = base_dir + "/png/"
    print("Creating repository to store images")
    # makedirs creates base_dir as well, since it is a parent of save_path.
    os.makedirs(save_path, exist_ok=True)
    print("Directory created : ", save_path)

    paths_to_img = []
    for i, image in enumerate(images):
        # save_path already ends with "/", so build the path once and use the
        # same string for logging, saving and the returned list.
        img_path = f"{save_path}{i}_page.png"
        print(f"saving page {i} in {img_path}")
        image.save(img_path, "PNG")
        paths_to_img.append(img_path)

    if get_output_in_code:
        return images, paths_to_img
    return None
def pdfs_folder_to_images(input_path, get_output_in_code = False):
    """Walk ``input_path`` recursively and convert every ``.pdf`` file found.

    Each PDF is passed to ``pdf_to_image``. When ``get_output_in_code`` is
    true, a dict mapping each PDF path to the value returned by
    ``pdf_to_image(..., get_output_in_code=True)`` is returned; otherwise the
    function returns ``None``.
    """
    converted = {}
    for folder, _subdirs, filenames in os.walk(input_path):
        for filename in filenames:
            # Only files whose name ends in lowercase ".pdf" are processed.
            if not filename.endswith('.pdf'):
                continue
            pdf_path = os.path.join(folder, filename)
            print("FILE IS ", pdf_path)
            if not get_output_in_code:
                pdf_to_image(pdf_path)
                continue
            converted[pdf_path] = pdf_to_image(pdf_path, get_output_in_code=True)
    return converted if get_output_in_code else None
def _guess_image_mime(b64_image):
    """Return the MIME type implied by the leading characters of a base64 image."""
    # The 8-byte PNG signature always base64-encodes to a string starting
    # with "iVBORw0KGg". Anything else keeps the historical JPEG label.
    if isinstance(b64_image, str) and b64_image.startswith("iVBORw0KGg"):
        return "image/png"
    return "image/jpeg"


def img_to_txt(img):
    """Ask GPT-4o to transcribe and explain the content of an image.

    Args:
        img: The image as a base64-encoded string (as produced by
            ``encode_image``).

    Returns:
        The text content of the first completion choice, or ``""`` if the
        response carries no text content.
    """
    # Label the data URL with the image's real type: the pages produced in
    # this file are PNG, so a hard-coded "image/jpeg" would mislabel them.
    mime_type = _guess_image_mime(img)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "Your task is to transcribe and explain in English every single thing from screenshots sent by users"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{img}",
                        }
                    }
                ]
            }
        ],
        temperature=1,
        max_tokens=1999,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    # message.content may be None; normalise to "" so callers can always
    # treat the result as text (e.g. write it to a file).
    return response.choices[0].message.content or ""
def img_to_txt_gemini(img):
    """Stub counterpart of ``img_to_txt``: ignores ``img`` and returns an empty string.

    NOTE(review): the name suggests a Gemini-backed implementation was
    planned; nothing is implemented here.
    """
    transcription = ""
    return transcription
def process_pdf_hq(path, get_output_in_code=True):
    """Convert every PDF under ``path`` to page images and transcribe each page.

    Each PDF found by ``pdfs_folder_to_images`` is rendered to PNG pages; each
    page is base64-encoded, sent to ``img_to_txt`` and the resulting text is
    written to ``<path>/extracted_pdf/PDF_FILE_<sanitised image path>.txt``.

    Args:
        path: Root folder to scan for PDF files.
        get_output_in_code: When true, return the extracted text; otherwise
            return ``None``.

    Returns:
        A dict mapping each PDF path to the list of its page transcriptions
        (in page order) when ``get_output_in_code`` is true, else ``None``.
    """
    converted_pdf_router = pdfs_folder_to_images(path, get_output_in_code=True)
    # Created after the scan above, so the output folder is not walked by it.
    path_extracted_pdf = path+"/extracted_pdf/"
    os.makedirs(path_extracted_pdf, exist_ok=True)

    content_extracted = {}
    for link, converted in converted_pdf_router.items():
        print("Working on ", link)
        page_texts = []
        content_extracted[link] = page_texts
        # converted is the (images, paths_to_img) pair; only the paths are used.
        for img_path in converted[1]:
            print("Processing subimage")
            base64_image = encode_image(img_path)
            content = img_to_txt(base64_image)
            print(img_path)
            page_texts.append(content)
            # Flatten the image path into a single file name.
            out_name = f"PDF_FILE_{img_path.replace('/','_').replace('.','_')}.txt"
            out_path = os.path.join(path_extracted_pdf, out_name)
            # Model output may contain non-ASCII text; write UTF-8 explicitly
            # instead of relying on the platform's default encoding.
            with open(out_path, "w", encoding="utf-8") as fil:
                fil.write(content)

    if get_output_in_code:
        return content_extracted
    return None