import os os.system('git clone https://github.com/facebookresearch/detectron2.git') os.system('pip install -e detectron2') os.system("git clone https://github.com/microsoft/unilm.git") os.system("sed -i 's/from collections import Iterable/from collections.abc import Iterable/' unilm/dit/object_detection/ditod/table_evaluation/data_structure.py") os.system("curl -LJ -o publaynet_dit-b_cascade.pth 'https://layoutlm.blob.core.windows.net/dit/dit-fts/publaynet_dit-b_cascade.pth?sv=2022-11-02&ss=b&srt=o&sp=r&se=2033-06-08T16:48:15Z&st=2023-06-08T08:48:15Z&spr=https&sig=a9VXrihTzbWyVfaIDlIT1Z0FoR1073VB0RLQUMuudD4%3D'") import sys sys.path.append("unilm") sys.path.append("detectron2") import cv2 import filetype from PIL import Image import numpy as np from io import BytesIO from pdf2image import convert_from_bytes, convert_from_path import re import requests from urllib.parse import urlparse, parse_qs from unilm.dit.object_detection.ditod import add_vit_config import torch from detectron2.config import CfgNode as CN from detectron2.config import get_cfg from detectron2.utils.visualizer import ColorMode, Visualizer from detectron2.data import MetadataCatalog from detectron2.engine import DefaultPredictor from huggingface_hub import hf_hub_download import gradio as gr # Step 1: instantiate config cfg = get_cfg() add_vit_config(cfg) #cfg.merge_from_file("cascade_dit_base.yml") cfg.merge_from_file("unilm/dit/object_detection/publaynet_configs/cascade/cascade_dit_base.yaml") # Step 2: add model weights URL to config filepath = hf_hub_download(repo_id="Sebas6k/DiT_weights", filename="publaynet_dit-b_cascade.pth", repo_type="model") cfg.MODEL.WEIGHTS = filepath # Step 3: set device cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Step 4: define model predictor = DefaultPredictor(cfg) def analyze_image(img): md = MetadataCatalog.get(cfg.DATASETS.TEST[0]) if cfg.DATASETS.TEST[0]=='icdar2019_test': md.set(thing_classes=["table"]) else: md.set(thing_classes=["text","title","list","table","figure"]) ## these are categories from PubLayNet (PubMed PDF/XML data): https://ieeexplore.ieee.org/document/8977963 outputs = predictor(img) instances = outputs["instances"] # Ensure we're operating on CPU for numpy compatibility instances = instances.to("cpu") # Filter out figures based on class labels high_confidence = [] medium_confidence = [] low_confidence = [] for i in range(len(instances)): if md.thing_classes[instances.pred_classes[i]] == "figure": box = instances.pred_boxes.tensor[i].numpy().astype(int) cropped_img = img[box[1]:box[3], box[0]:box[2]] confidence_score = instances.scores[i].numpy() * 100 # convert to percentage confidence_text = f"Score: {confidence_score:.2f}%" # Overlay confidence score on the image # Enhanced label visualization with orange color font_scale = 0.9 font_thickness = 2 text_color = (255, 255, 255) # white background background_color = (255, 165, 0) # RGB for orange (text_width, text_height), _ = cv2.getTextSize(confidence_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness) padding = 12 text_offset_x = padding - 3 text_offset_y = cropped_img.shape[0] - padding + 2 box_coords = ((text_offset_x, text_offset_y + padding // 2), (text_offset_x + text_width + padding, text_offset_y - text_height - padding // 2)) cv2.rectangle(cropped_img, box_coords[0], box_coords[1], background_color, cv2.FILLED) cv2.putText(cropped_img, confidence_text, (text_offset_x, text_offset_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness) # Categorize images based on confidence levels if confidence_score > 85: high_confidence.append(cropped_img) elif confidence_score > 50: medium_confidence.append(cropped_img) else: low_confidence.append(cropped_img) v = Visualizer(img[:, :, ::-1], md, scale=1.0, instance_mode=ColorMode.SEGMENTATION) result_image = v.draw_instance_predictions(instances).get_image()[:, :, ::-1] return result_image, high_confidence, medium_confidence, low_confidence # output = predictor(img)["instances"] # v = Visualizer(img[:, :, ::-1], # md, # scale=1.0, # instance_mode=ColorMode.SEGMENTATION) # result = v.draw_instance_predictions(output.to("cpu")) # result_image = result.get_image()[:, :, ::-1] # ## figs = [img[box[1]:box[3], box[0]:box[2]] for box, cls in zip(output.pred_boxes, output.pred_classes) if md.thing_classes[cls] == "figure"] # # return result_image, figs def handle_input(input_data): images = [] #input_data is a dict with keys 'text' and 'files' if 'text' in input_data and input_data['text']: input_text = input_data['text'].strip() # this is either a URL or a PDF ID if input_text.startswith('http://') or input_text.startswith('https://'): # Extract the ID from the URL url_parts = urlparse(input_text) query_params = parse_qs(url_parts.fragment) # Assumes ID is a fragment parameter pdf_id = query_params.get('id', [None])[0] if not pdf_id: raise ValueError("PDF ID not found in URL") else: # Assume input is a direct PDF ID pdf_id = input_text if not re.match(r'^[a-zA-Z]{4}\d{4}$', pdf_id): raise ValueError("Invalid PDF ID format. Expected four letters followed by four numbers.") # Assume input is a PDF ID, convert to URL # Now construct the download URL pdf_url = construct_download_url(pdf_id) #https://download.industrydocuments.ucsf.edu/k/t/k/l/ktkl0236/ktkl0236.pdf # Assume input is a PDF URL pdf_data = download_pdf(pdf_url) images = pdf_to_images(pdf_data) if 'files' in input_data and input_data['files']: for file_path in input_data['files']: print("Type of file as uploaded:", type(file_path)) print(f" File: {file_path}") # Check if the input is a file and determine its type kind = filetype.guess(file_path) if kind.mime.startswith('image'): # Process a single image images.append(load_image(file_path)) # Process image directly elif kind.mime == 'application/pdf': # Convert PDF pages to images images.extend(pdf_to_images(file_path)) else: raise ValueError("Unsupported file type.") if not images: raise ValueError("No valid input provided. Please upload a file or enter a PDF ID.") # Assuming processing images returns galleries of images by confidence return process_images(images) def load_image(img_path): print(f"Loading image: {img_path}") # Load an image from a file path image = Image.open(img_path) if isinstance(image, Image.Image): image = np.array(image) # Convert PIL Image to numpy array # Ensure the image is in the correct format if image.ndim == 2: # Image is grayscale image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) elif image.ndim == 3 and image.shape[2] == 3: image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # image = image[:, :, ::-1] # Convert RGB to BGR if necessary return image def construct_download_url(pdf_id): # Construct the download URL from the PDF ID # https://download.examples.edu/k/t/k/l/ktkl0236/ktkl0236.pdf path_parts = '/'.join(pdf_id[i] for i in range(4)) # 'k/t/k/l' download_url = f"https://download.industrydocuments.ucsf.edu/{path_parts}/{pdf_id}/{pdf_id}.pdf" return download_url def download_pdf(pdf_url): # Download the PDF file from the given URL response = requests.get(pdf_url) response.raise_for_status() # Ensure we notice bad responses return BytesIO(response.content) def pdf_to_images(data_or_path): # Create a temporary directory to store the page images temp_dir = "temp_images" os.makedirs(temp_dir, exist_ok=True) try: # Convert PDF to a list of PIL images # Handle both BytesIO and file path input for PDF conversion if isinstance(data_or_path, BytesIO): # Convert directly from bytes pages = convert_from_bytes(data_or_path.read()) elif isinstance(data_or_path, str): # Convert from a file path pages = convert_from_path(data_or_path) # Save each page as an image file page_images = [] for i, page in enumerate(pages): image_path = os.path.join(temp_dir, f"page_{i+1}.jpg") page.save(image_path, "JPEG") page_images.append(load_image(image_path)) return page_images except Exception as e: print(f"Error converting PDF to images: {str(e)}") return [] finally: # Clean up the temporary directory (optional) # os.rmdir(temp_dir) pass def process_images(images): all_processed_images = [] all_high_confidence = [] all_medium_confidence = [] all_low_confidence = [] for img in images: #print("Type of img before processing:", type(img)) #print(f" img before processing: {img}") processed_images, high_confidence, medium_confidence, low_confidence = analyze_image(img) all_processed_images.append(processed_images) all_high_confidence.extend(high_confidence) all_medium_confidence.extend(medium_confidence) all_low_confidence.extend(low_confidence) return all_processed_images, all_high_confidence, all_medium_confidence, all_low_confidence title = "OIDA Image Collection Interactive demo: Document Layout Analysis with DiT and PubLayNet" description = "
Paper | Github Repo | HuggingFace doc | PubLayNet paper
" #examples =[['fpmj0236_Page_012.png'],['fnmf0234_Page_2.png'],['publaynet_example.jpeg'],['fpmj0236_Page_018.png'],['lrpw0232_Page_14.png'],['kllx0250'],['https://www.industrydocuments.ucsf.edu/opioids/docs/#id=yqgg0230']] examples =[{'files': ['fnmf0234_Page_2.png']},{'files': ['fpmj0236_Page_012.png']},{'files': ['lrpw0232.pdf']},{'text': 'https://www.industrydocuments.ucsf.edu/opioids/docs/#id=yqgg0230'},{'files':['fpmj0236_Page_018.png']},{'files':['lrpw0232_Page_14.png']},{'files':['publaynet_example.jpeg']},{'text':'kllx0250'},{'text':'txhk0255'}] #txhk0255 css = ".output-image, .input-image, .image-preview {height: 600px !important} td.textbox {display:none;} #component-5 .submit-button {display:none;}" #iface = gr.Interface(fn=handle_input, # inputs=gr.MultimodalTextbox(interactive=True, # label="Upload image/PDF file OR enter OIDA ID or URL", # file_types=["image",".pdf"], # placeholder="Upload image/PDF file OR enter OIDA ID or URL"), # outputs=[gr.Gallery(label="annotated documents"), # gr.Gallery(label="Figures with High (>85%) Confidence Scores"), # gr.Gallery(label="Figures with Moderate (50-85%) Confidence Scores"), # gr.Gallery(label="Figures with Lower Confidence (under 50%) Scores")], # title=title, # description=description, # examples=examples, # article=article, # css=css) ## enable_queue=True) with gr.Blocks(css=css) as iface: gr.Markdown(f"# {title}") gr.HTML(description) with gr.Row(): with gr.Column(): input = gr.MultimodalTextbox(interactive=True, label="Upload image/PDF file OR enter OIDA ID or URL", file_types=["image",".pdf"], placeholder="Upload image/PDF file OR enter OIDA ID or URL", submit_btn=None) submit_btn = gr.Button("Submit") gr.HTML('