Image2Paragraph / app.py
Awiny's picture
update gradio ui
40adb4f
raw
history blame
4.14 kB
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import base64
from io import BytesIO
from models.image_text_transformation import ImageTextTransformation
import argparse
import torch
# Command-line configuration.
#
# The options are declared first; afterwards every per-model device option is
# resolved to a concrete "cuda"/"cpu" string on `args`.
parser = argparse.ArgumentParser()
parser.add_argument('--gpt_version', choices=['gpt-3.5-turbo', 'gpt4'], default='gpt-3.5-turbo')
# NOTE(review): the next two flags are `store_true` with default=True, so
# passing them changes nothing and they cannot be switched off from the CLI.
parser.add_argument('--image_caption', action='store_true', dest='image_caption', default=True, help='Set this flag to True if you want to use BLIP2 Image Caption')
parser.add_argument('--dense_caption', action='store_true', dest='dense_caption', default=True, help='Set this flag to True if you want to use Dense Caption')
parser.add_argument('--semantic_segment', action='store_true', dest='semantic_segment', default=False, help='Set this flag to True if you want to use semantic segmentation')
# The device options default to None (instead of 'cpu') so that "not given on
# the command line" can be told apart from an explicit user choice below.
parser.add_argument('--image_caption_device', choices=['cuda', 'cpu'], default=None, help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
parser.add_argument('--dense_caption_device', choices=['cuda', 'cpu'], default=None, help='Select the device: cuda or cpu, < 6G GPU is not recommended>')
parser.add_argument('--semantic_segment_device', choices=['cuda', 'cpu'], default=None, help='Select the device: cuda or cpu, gpu memory larger than 14G is recommended')
parser.add_argument('--contolnet_device', choices=['cuda', 'cpu'], default=None, help='Select the device: cuda or cpu, <6G GPU is not recommended>')
args = parser.parse_args()

# Device used for every option the user did not set explicitly.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Attribute names on `args` that hold a per-model device string.
_DEVICE_OPTIONS = (
    'image_caption_device',
    'dense_caption_device',
    'semantic_segment_device',
    'contolnet_device',
)

for _option in _DEVICE_OPTIONS:
    _requested = getattr(args, _option)
    # Keep an explicit choice; fall back to the detected device when the
    # option was omitted, or when CUDA was requested but is not available.
    if _requested is None or (_requested == "cuda" and device != "cuda"):
        setattr(args, _option, device)
def pil_image_to_base64(image):
    """Return the JPEG encoding of *image* as a base64 text string.

    Args:
        image: An image object exposing ``mode``, ``convert(mode)`` and
            ``save(fp, format=...)`` (a PIL image, judging by the name).

    Returns:
        str: Base64 text of the JPEG bytes, without a ``data:`` URI prefix.
    """
    # JPEG cannot store alpha or palette data; Pillow raises OSError for such
    # modes (e.g. "cannot write mode RGBA as JPEG"), so convert them first.
    # Modes the JPEG writer already accepts are passed through untouched.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()
def add_logo(logo_path="examples/logo.png"):
    """Read an image file and return its contents as a base64 text string.

    Args:
        logo_path: Path of the file to read. Defaults to the logo location
            this script has always used, so ``add_logo()`` is unchanged.

    Returns:
        str: Base64 text of the file's raw bytes (no ``data:`` URI prefix).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(logo_path, "rb") as logo_file:
        raw_bytes = logo_file.read()
    return base64.b64encode(raw_bytes).decode()
def process_image(image_src, options, processor):
    """Run image -> text -> image and render both results as one HTML snippet.

    Args:
        image_src: Input image reference, forwarded unchanged to
            ``processor.image_to_text``.
        options: Collection of selected option labels; ``None`` or an empty
            collection means no option is selected.
        processor: Object providing ``args``, ``image_to_text`` and
            ``text_to_image``.

    Returns:
        str: HTML containing the generated text and the generated image
        embedded as a base64 JPEG data URI.
    """
    # Local import keeps this function self-contained.
    import html

    # Treat a missing selection like an empty one instead of raising
    # TypeError on `in None`.
    selected_options = options or ()
    processor.args.semantic_segment = "Semantic Segment" in selected_options

    gen_text = processor.image_to_text(image_src)
    # The image generator receives the raw text; escaping is display-only.
    gen_image = processor.text_to_image(gen_text)
    gen_image_str = pil_image_to_base64(gen_image)

    # The text is model-generated, so escape it before embedding in markup;
    # otherwise a stray "<" or "&" could break or inject into the page.
    safe_text = html.escape(str(gen_text))

    # Combine the outputs into a single HTML output
    custom_output = f'''
<h2>Image->Text->Image:</h2>
<div style="display: flex; flex-wrap: wrap;">
<div style="flex: 1;">
<h3>Image2Text</h3>
<p>{safe_text}</p>
</div>
<div style="flex: 1;">
<h3>Text2Image</h3>
<img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
</div>
</div>
'''
    return custom_output
# Build the processing pipeline once; the interface callback below closes
# over this single instance.
processor = ImageTextTransformation(args)

# Create Gradio input and output components.
# Top-level component classes are used throughout (matching the existing
# `gr.CheckboxGroup`) rather than the deprecated `gr.inputs` / `gr.outputs`.
image_input = gr.Image(type='filepath', label="Input Image")
# NOTE(review): this checkbox is not wired into the interface below (the
# CheckboxGroup is used instead); kept so the module-level name still exists.
semantic_segment_checkbox = gr.Checkbox(label="Semantic Segment", value=False)

logo_base64 = add_logo()

# Create the title with the logo. The logo file is a PNG, so the data URI
# declares image/png.
title_with_logo = f'<img src="data:image/png;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'

# Create Gradio interface
interface = gr.Interface(
    fn=lambda image, options: process_image(image, options, processor),
    inputs=[
        image_input,
        gr.CheckboxGroup(
            label="Options",
            choices=["Semantic Segment"],
        ),
    ],
    outputs=gr.HTML(),
    title=title_with_logo,
    description="""
This code support image to text transformation. Then the generated text can do retrieval, question answering et al to conduct zero-shot.
\n Semantic segment is very slow in cpu(~8m), best use on gpu or run local.
\n Notice the text2image model is controlnet, which used canny edge as reference.
"""
)

# Launch the interface
interface.launch()