pierreguillou committed on
Commit d0e0e62 · 1 Parent(s): 9361e03

Create app.py

Files changed (1)
  app.py  +313 -0
app.py ADDED
@@ -0,0 +1,313 @@
+ import gradio as gr
+ from PIL import Image, ImageDraw, ImageFont
+ import random
+ import pandas as pd
+ import numpy as np
+ from datasets import concatenate_datasets
+ from operator import itemgetter
+ import collections
+
+ # download datasets
+ from datasets import load_dataset
+
+ dataset_small = load_dataset("pierreguillou/DocLayNet-small")
+ dataset_base = load_dataset("pierreguillou/DocLayNet-base")
+
+ id2label = {idx: label for idx, label in enumerate(dataset_small["train"].features["categories"].feature.names)}
+ labels = [label for idx, label in id2label.items()]
+
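+ # Note: `id2label` maps each class id in the dataset to one of the 11 DocLayNet labels
+ # (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header,
+ # Table, Text, Title), and `labels` is the corresponding list of label names.
+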
+ # need to change the coordinates format
+ def convert_box(box):
+     x, y, w, h = tuple(box)  # the row comes in (left, top, width, height) format
+     actual_box = [x, y, x+w, y+h]  # we turn it into (left, top, left+width, top+height) to get the actual box
+     return actual_box
+
+ # get back the original size
+ def original_box(box, original_width, original_height, coco_width, coco_height):
+     return [
+         int(original_width * (box[0] / coco_width)),
+         int(original_height * (box[1] / coco_height)),
+         int(original_width * (box[2] / coco_width)),
+         int(original_height * (box[3] / coco_height)),
+     ]
+
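+ # Illustrative check (not used by the app): a COCO-format box [x, y, w, h] = [10, 20, 100, 50]
+ # becomes [10, 20, 110, 70] with convert_box, and rescaling that box from a 1025x1025 page
+ # to an 850x1100 page with original_box gives [8, 21, 91, 75].
+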
+ # function to sort bounding boxes
+ def get_sorted_boxes(bboxes):
+
+     # sort by y from page top to bottom
+     bboxes = sorted(bboxes, key=itemgetter(1), reverse=False)
+     y_list = [bbox[1] for bbox in bboxes]
+
+     # sort by x from page left to right when boxes share the same y
+     if len(list(set(y_list))) != len(y_list):
+         y_list_duplicates_indexes = dict()
+         y_list_duplicates = [item for item, count in collections.Counter(y_list).items() if count > 1]
+         for item in y_list_duplicates:
+             y_list_duplicates_indexes[item] = [i for i, e in enumerate(y_list) if e == item]
+             bbox_list_y_duplicates = sorted(np.array(bboxes)[y_list_duplicates_indexes[item]].tolist(), key=itemgetter(0), reverse=False)
+             np_array_bboxes = np.array(bboxes)
+             np_array_bboxes[y_list_duplicates_indexes[item]] = np.array(bbox_list_y_duplicates)
+             bboxes = np_array_bboxes.tolist()
+
+     return bboxes
+
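+ # Illustrative check (not used by the app): boxes sharing the same top coordinate are re-ordered
+ # from left to right, e.g. get_sorted_boxes([[300, 50, 400, 80], [10, 50, 200, 80], [10, 10, 500, 40]])
+ # returns [[10, 10, 500, 40], [10, 50, 200, 80], [300, 50, 400, 80]].
+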
+ # category colors
+ label2color = {
+     'Caption': 'brown',
+     'Footnote': 'orange',
+     'Formula': 'gray',
+     'List-item': 'yellow',
+     'Page-footer': 'red',
+     'Page-header': 'red',
+     'Picture': 'violet',
+     'Section-header': 'orange',
+     'Table': 'green',
+     'Text': 'blue',
+     'Title': 'pink'
+ }
+
+ # image without content
+ examples_dir = 'samples/'
+ images_wo_content = examples_dir + "wo_content.png"
+
+ df_paragraphs_wo_content, df_lines_wo_content = pd.DataFrame(), pd.DataFrame()
+
+ df_paragraphs_wo_content["paragraphs"] = [0]
+ df_paragraphs_wo_content["categories"] = ["no content"]
+ df_paragraphs_wo_content["texts"] = ["no content"]
+ df_paragraphs_wo_content["bounding boxes"] = ["no content"]
+
+ df_lines_wo_content["lines"] = [0]
+ df_lines_wo_content["categories"] = ["no content"]
+ df_lines_wo_content["texts"] = ["no content"]
+ df_lines_wo_content["bounding boxes"] = ["no content"]
+
+ # default font & lists of UI parameters
+ font = ImageFont.load_default()
+
+ dataset_names = ["small", "base"]
+ splits = ["all", "train", "validation", "test"]
+ domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
+ domains_names = [domain_name.lower().replace(" ", "_") for domain_name in domains]
+ categories = labels + ["all"]
+
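+ # e.g. "Financial Reports" -> "financial_reports": domains_names holds the lower-case, underscored
+ # form that the code below compares with the dataset's "doc_category" column.
+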
+ # function to get a random image and all its data from DocLayNet
+ def generate_annotated_image(dataset_name, split, domain, category):
+
+     def get_dataset(dataset_name, split, domain, category):
+
+         # error message
+         msg_error = ""
+
+         # get dataset
+         if dataset_name == "small": example = dataset_small
+         else: example = dataset_base
+
+         # get split
+         if split == "all":
+             example = concatenate_datasets([example["train"], example["validation"], example["test"]])
+         else:
+             example = example[split]
+
+         # get domain
+         domain_name = domains_names[domains.index(domain)]
+         if domain_name != "all":
+             example = example.filter(lambda example: example["doc_category"] == domain_name)
+             if len(example) == 0:
+                 msg_error = f'There is no image with at least one annotated bounding box that matches your parameters ("{domain}" domain / "DocLayNet {dataset_name}" dataset / "{split}" split).'
+                 example = dict()
+                 return example, msg_error
+
+         # get category (the "categories" column stores label ids, so compare against the id of the selected label;
+         # enumerate gives positional indices, which is what .select() expects)
+         idx_list = list()
+         if category != "all":
+             category_id = labels.index(category)
+             for idx, categories_list in enumerate(example["categories"]):
+                 if category_id in categories_list:
+                     idx_list.append(idx)
+             example = example.select(idx_list)
+             if len(example) == 0:
+                 msg_error = f'There is no image with at least one annotated bounding box that matches your parameters (category: "{category}" / domain: "{domain}" / dataset: "DocLayNet {dataset_name}" / split: "{split}").'
+                 example = dict()
+                 return example, msg_error
+
+         return example, msg_error
+
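+     # For instance, get_dataset("small", "train", "Manuals", "Table") is expected to return the subset
+     # of the DocLayNet-small train split whose pages come from manuals and contain at least one "Table"
+     # annotation (or an empty dict plus an error message when nothing matches).
+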
+     # get results
+     example, msg_error = get_dataset(dataset_name, split, domain, category)
+
+     if len(msg_error) > 0:
+         return msg_error, images_wo_content, images_wo_content, df_paragraphs_wo_content, df_lines_wo_content
+
+     else:
+         # get random image & PDF data
+         image_files = example["image"]
+         index = random.randint(0, len(image_files) - 1)  # randint is inclusive on both ends
+         image = image_files[index]  # original image
+         coco_width, coco_height = example[index]["coco_width"], example[index]["coco_height"]
+         original_width, original_height = example[index]["original_width"], example[index]["original_height"]
+         original_filename = example[index]["original_filename"]
+         page_no = example[index]["page_no"]
+         num_pages = example[index]["num_pages"]
+
+         # resize the image to its original size
+         image = image.resize((original_width, original_height))
+
+         # get corresponding annotations
+         texts = example[index]["texts"]
+         bboxes_block = example[index]["bboxes_block"]
+         bboxes_line = example[index]["bboxes_line"]
+         categories = example[index]["categories"]
+         domain = example[index]["doc_category"]
+
+         # get the list of categories present on the page
+         categories_unique = sorted(set(categories))
+         categories_unique = [id2label[idx] for idx in categories_unique]
+
+         # convert boxes to the original page size
+         original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block]
+         original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line]
+         original_bboxes = [original_bboxes_block, original_bboxes_line]
+
+         ##### block boxes #####
+
+         # get list of unique block boxes (lines of the same block are assumed to be consecutive in the annotations)
+         original_blocks = dict()
+         original_bboxes_block_list = list()
+         original_bbox_block_prec = list()
+         for count_block, original_bbox_block in enumerate(original_bboxes_block):
+             if original_bbox_block != original_bbox_block_prec:
+                 original_bbox_block_indexes = [i for i, original_bbox in enumerate(original_bboxes_block) if original_bbox == original_bbox_block]
+                 original_blocks[count_block] = original_bbox_block_indexes
+                 original_bboxes_block_list.append(original_bbox_block)
+             original_bbox_block_prec = original_bbox_block
+
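+         # Illustrative example: if original_bboxes_block were [A, A, B] (two lines in block A followed
+         # by one line in block B), original_blocks would be {0: [0, 1], 2: [2]} and
+         # original_bboxes_block_list would be [A, B]; the texts of lines 0 and 1 are merged into the
+         # single text of block A below.
+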
+         # get list of categories and texts by unique block boxes
+         category_block_list, text_block_list = list(), list()
+         for original_bbox_block in original_bboxes_block_list:
+             count_block = original_bboxes_block.index(original_bbox_block)
+             original_bbox_block_indexes = original_blocks[count_block]
+             category_block = categories[original_bbox_block_indexes[0]]
+             category_block_list.append(category_block)
+             if id2label[category_block] in ("Text", "Caption", "Footnote"):
+                 # running text: join the block's lines with spaces
+                 text_block = ' '.join(np.array(texts)[original_bbox_block_indexes].tolist())
+             else:
+                 # Section-header, Title, Picture, Formula, List-item, Table, Page-header, Page-footer: keep one line per row
+                 text_block = '\n'.join(np.array(texts)[original_bbox_block_indexes].tolist())
+             text_block_list.append(text_block)
+
+         # sort data from y = 0 to the end of the page (and, for boxes with the same y, from x = 0 to the right)
+         sorted_original_bboxes_block_list = get_sorted_boxes(original_bboxes_block_list)
+         sorted_original_bboxes_block_list_indexes = [original_bboxes_block_list.index(item) for item in sorted_original_bboxes_block_list]
+         sorted_category_block_list = np.array(category_block_list)[sorted_original_bboxes_block_list_indexes].tolist()
+         sorted_text_block_list = np.array(text_block_list)[sorted_original_bboxes_block_list_indexes].tolist()
+
+         ##### line boxes #####
+
+         # sort data from y = 0 to the end of the page (and, for boxes with the same y, from x = 0 to the right)
+         original_bboxes_line_list = original_bboxes_line
+         category_line_list = categories
+         text_line_list = texts
+         sorted_original_bboxes_line_list = get_sorted_boxes(original_bboxes_line_list)
+         sorted_original_bboxes_line_list_indexes = [original_bboxes_line_list.index(item) for item in sorted_original_bboxes_line_list]
+         sorted_category_line_list = np.array(category_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
+         sorted_text_line_list = np.array(text_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
+
+         # set up images & PDF data
+         columns = 2
+         images = [image.copy(), image.copy()]
+         num_imgs = len(images)
+
+         imgs, df_paragraphs, df_lines = dict(), pd.DataFrame(), pd.DataFrame()
+         for i, img in enumerate(images):
+
+             draw = ImageDraw.Draw(img)
+
+             for box, label_idx, text in zip(original_bboxes[i], categories, texts):
+                 label = id2label[label_idx]
+                 color = label2color[label]
+                 draw.rectangle(box, outline=color)
+                 text = text.encode('latin-1', 'replace').decode('latin-1')  # https://stackoverflow.com/questions/56761449/unicodeencodeerror-latin-1-codec-cant-encode-character-u2013-writing-to
+                 draw.text((box[0] + 10, box[1] - 10), text=label, fill=color, font=font)
+
+             if i == 0:
+                 imgs["paragraphs"] = img
+
+                 df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
+                 df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
+                 df_paragraphs["texts"] = sorted_text_block_list
+                 df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
+
+             else:
+                 imgs["lines"] = img
+
+                 df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
+                 df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
+                 df_lines["texts"] = sorted_text_line_list
+                 df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
+
+         msg = f'Page {page_no} of the PDF "{original_filename}" (domain "{domain}") matches your parameters.'
+
+         return msg, imgs["paragraphs"], imgs["lines"], df_paragraphs, df_lines
+
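+ # Illustrative call (what the button and the cached example below trigger):
+ # msg, img_paragraphs, img_lines, df_paragraphs, df_lines = generate_annotated_image("small", "all", "all", "all")
+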
+ # gradio app
+ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
+     gr.HTML("""
+     <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
+     <div style="margin-top: 20px"><p>(01/29/2023) This app is an image viewer for the DocLayNet dataset.</p></div>
+     <div><p>It uses the datasets <a href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>.</p></div>
+     <div><p>Select your parameters and the output will show two images of a randomly selected PDF page with annotated bounding boxes (one for paragraphs, one for lines) and tables of the corresponding texts with their labels.</p></div>
+     """)
+     with gr.Row():
+         with gr.Column():
+             dataset_name_gr = gr.Radio(dataset_names, value="small", label="DocLayNet dataset")
+         with gr.Column():
+             split_gr = gr.Dropdown(splits, value="all", label="Split")
+         with gr.Column():
+             domain_gr = gr.Dropdown(domains, value="all", label="Domain")
+         with gr.Column():
+             category_gr = gr.Dropdown(categories, value="all", label="Category")
+     btn = gr.Button("Display PDF image")
+     with gr.Row():
+         output_msg = gr.Textbox(label="Results")
+     with gr.Row():
+         # with gr.Column():
+         #     json = gr.JSON(label="JSON")
+         with gr.Column():
+             img_paragraphs = gr.Image(type="pil", label="Bounding boxes of paragraphs")
+         with gr.Column():
+             img_lines = gr.Image(type="pil", label="Bounding boxes of lines")
+
+     with gr.Row():
+         with gr.Column():
+             df_paragraphs = gr.Dataframe(
+                 headers=["paragraphs", "categories", "texts", "bounding boxes"],
+                 datatype=["number", "str", "str", "str"],
+                 # row_count='dynamic',
+                 col_count=(4, "fixed"),
+                 interactive=False,
+                 label="Paragraphs data",
+                 type="pandas",
+                 wrap=True
+             )
+         with gr.Column():
+             df_lines = gr.Dataframe(
+                 headers=["lines", "categories", "texts", "bounding boxes"],
+                 datatype=["number", "str", "str", "str"],
+                 # row_count='dynamic',
+                 col_count=(4, "fixed"),
+                 interactive=False,
+                 label="Lines data",
+                 type="pandas",
+                 wrap=True
+             )
+     btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines])
+
+     gr.Markdown("## Example")
+     gr.Examples(
+         [["small", "all", "all", "all"]],
+         [dataset_name_gr, split_gr, domain_gr, category_gr],
+         [output_msg, img_paragraphs, img_lines, df_paragraphs, df_lines],
+         fn=generate_annotated_image,
+         cache_examples=True,
+     )
+
+ demo.launch()