# AI4PE-pre-alpha / pdfparser_performance.py
# Adr740's picture
# Update pdfparser_performance.py
# 9d554d5 verified
import subprocess
import os
import shutil
def rename_and_move_files(root_folder):
    """Flatten Markdown files under ``root_folder`` into ``*_PDF_FILE.txt`` files.

    Walks ``root_folder`` recursively. Every file whose name ends with
    ``.md`` is moved directly into ``root_folder`` with that trailing suffix
    replaced by ``_PDF_FILE.txt``. Each sub-folder that contained at least
    one such file is then deleted together with whatever else is left in it.
    ``root_folder`` itself is never deleted.

    Args:
        root_folder: Directory to scan; also the destination of the renamed
            files.

    Raises:
        OSError: If a file cannot be renamed or a folder cannot be removed.
    """
    md_suffix = '.md'
    txt_suffix = '_PDF_FILE.txt'
    root_abs = os.path.abspath(root_folder)

    # Walk bottom-up so that Markdown files in nested folders are rescued
    # before any of their parent folders is removed.
    for foldername, _subfolders, filenames in os.walk(root_folder, topdown=False):
        md_files = [name for name in filenames if name.endswith(md_suffix)]

        # Move every Markdown file out first; deleting the folder after the
        # first hit would make the remaining ones vanish.
        for filename in md_files:
            source_path = os.path.join(foldername, filename)
            # Swap only the trailing extension; str.replace would also
            # rewrite any '.md' occurring in the middle of the name.
            new_name = filename[:-len(md_suffix)] + txt_suffix
            os.rename(source_path, os.path.join(root_folder, new_name))

        # Never remove the destination folder itself: it now holds the
        # renamed files.
        if md_files and os.path.abspath(foldername) != root_abs:
            shutil.rmtree(foldername)
def process_pdf_performance(path):
    """Run ``marker_single`` on every PDF under ``path`` and collect the results.

    Creates ``<path>/extracted_pdf`` if needed, walks ``path`` recursively
    and invokes the external ``marker_single`` command once per file whose
    name ends with ``.pdf``, passing the PDF path, the output folder and
    ``--batch_multiplier 10``. Finally calls ``rename_and_move_files`` on
    the output folder.

    A failing or missing ``marker_single`` is reported on stdout and the
    remaining files are still processed.

    Args:
        path: Root directory to search for PDF files.
    """
    path_extracted_pdf = os.path.join(path, "extracted_pdf")
    os.makedirs(path_extracted_pdf, exist_ok=True)
    output_abs = os.path.abspath(path_extracted_pdf)

    for root, dirs, files in os.walk(path):
        # Prune our own output folder so the walk never feeds converter
        # output back into the converter.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_abs
        ]
        for file in files:
            if not file.endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file)
            print("FILE IS ", pdf_path)
            # An argument list (no shell) keeps spaces, quotes and other
            # shell metacharacters in directory or file names intact.
            command = [
                "marker_single",
                pdf_path,
                path_extracted_pdf,
                "--batch_multiplier",
                "10",
            ]
            try:
                completed = subprocess.run(command, check=False)
            except OSError as exc:
                # E.g. marker_single is not installed; keep going like the
                # previous shell-based call did, but say why.
                print(f"Could not run marker_single on {pdf_path}: {exc}")
                continue
            if completed.returncode != 0:
                print(f"marker_single exited with code {completed.returncode} for {pdf_path}")

    rename_and_move_files(path_extracted_pdf)