Spaces:

laicsiifes
/

veds-image-captioning

Sleeping

File size: 3,858 Bytes

70de5aa
 
 
 
ae059f3
70de5aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6d576b
 
 
 
 
 
70de5aa
ae059f3
70de5aa
ae059f3
70de5aa
 
 
 
 
 
 
0a3520e
465440b
 
0a3520e
465440b
 
0a3520e
70de5aa
 
 
 
 
 
 
 
ae059f3
70de5aa
 
 
 
 
 
 
 
 
77f055e
70de5aa
 
 
 
 
 
 
 
 
 
 
 
 
 
ae059f3
70de5aa
 
ae059f3

import logging
import os

import gradio as gr
import requests
from PIL import Image, UnidentifiedImageError
from transformers import AutoTokenizer, AutoImageProcessor, VisionEncoderDecoderModel

# Load the model, tokenizer, and image processor with error handling
def load_model_and_components(model_name):
    """Load a captioning model together with its tokenizer and image processor.

    Args:
        model_name: Identifier passed unchanged to each ``from_pretrained`` call.

    Returns:
        A ``(model, tokenizer, image_processor)`` tuple.

    Raises:
        RuntimeError: If any of the three ``from_pretrained`` calls fails. The
            original exception is attached as ``__cause__`` so the real
            failure stays visible in tracebacks.
    """
    try:
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        image_processor = AutoImageProcessor.from_pretrained(model_name)
    except Exception as e:
        # Explicit chaining marks the underlying error as the direct cause.
        raise RuntimeError(f"Error loading model components: {e}") from e
    return model, tokenizer, image_processor

# Checkpoint loaded at startup; the same identifier is the default selection
# of the model dropdown defined further down.
current_model_name = "laicsiifes/swin-distilbertimbau"
# Module-level components read by generate_caption(); load_new_model() rebinds
# these three names via `global` when the user picks another checkpoint.
model, tokenizer, image_processor = load_model_and_components(current_model_name)

# Function to process the image and generate a caption
def generate_caption(image):
    """Generate a caption for *image* using the currently loaded components.

    Reads the module-level ``image_processor``, ``model`` and ``tokenizer``.

    Args:
        image: Image input handed to ``image_processor`` (the upload handler
            in this file passes an RGB ``PIL.Image``).

    Returns:
        The first decoded caption, or the fallback text
        ``"Please upload a valid image."`` if any step raises.
    """
    try:
        pixel_values = image_processor(image, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return caption
    except Exception:
        # The UI still gets the generic message, but the real cause (which may
        # be a model/tokenizer failure rather than a bad image) is logged with
        # its traceback instead of being silently discarded.
        logging.getLogger(__name__).exception("Caption generation failed")
        return "Please upload a valid image."

# Predefined images for selection
image_folder = "images"
# os.listdir() returns entries in arbitrary order; sorting keeps the example
# gallery in a stable, predictable order across restarts and platforms.
predefined_images_paths = sorted(
    os.path.join(image_folder, fname)
    for fname in os.listdir(image_folder)
    if fname.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))
)

# Gradio app
def app(image=None):
    """Return a caption for *image*, or a fallback message when none can be made.

    Args:
        image: Image to caption; ``None`` means nothing was provided.

    Returns:
        The result of ``generate_caption(image)``, or
        ``"Please upload a valid image."`` when *image* is ``None`` or the
        captioning call raises.
    """
    fallback = "Please upload a valid image."
    if image is None:
        return fallback
    try:
        result = generate_caption(image)
    except Exception:
        return fallback
    return result

# Define UI
with gr.Blocks() as interface:
    gr.Markdown("""
        # Welcome to the LAICSI-IFES space for Vision Encoder-Decoder (VED) demonstration
        
        ---
        
        ### Be patient with the Swin-GPorTuguese-2 as it is heavier than the Swin-DistilBERTimbau.
    """)
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=["laicsiifes/swin-distilbertimbau", "laicsiifes/swin-gportuguese-2"],
                # Reuse the startup checkpoint name so the dropdown always
                # reflects the model that was actually loaded at import time.
                value=current_model_name,
                label="Select Model",
            )
            loading_message = gr.Textbox(label="Status Message")
            image_display = gr.Image(type="pil", label="Image Preview", interactive=False)
            upload_button = gr.File(label="Upload an Image", file_types=["image"], type="filepath")
            examples = gr.Examples(predefined_images_paths, inputs=[upload_button], label="Examples")

        with gr.Column():
            output_text = gr.Textbox(label="Generated Caption")

    # Define logic
    def handle_uploaded_image(image):
        """Open the uploaded file and caption it.

        Args:
            image: Filesystem path delivered by the ``gr.File`` component
                (``type="filepath"``), or ``None`` when it has been cleared.

        Returns:
            ``(preview_image, caption)`` for ``image_display`` and
            ``output_text``; ``(None, "Please upload a valid image.")`` when
            there is no file or it cannot be opened.
        """
        try:
            if image is None:
                return None, "Please upload a valid image."
            # The context manager closes the underlying file as soon as the
            # RGB copy exists, instead of leaving the handle to the GC.
            with Image.open(image) as source:
                pil_image = source.convert("RGB")
            return pil_image, generate_caption(pil_image)
        except Exception:
            return None, "Please upload a valid image."

    def switch_model(selected_model):
        """Show a loading notice and clear the upload, preview and caption.

        Returns one value per wired output:
        ``(status, upload_button, image_display, output_text)``.
        """
        gr.Info("Loading model... Please wait.")
        return "Loading model... Please wait.", None, None, None

    def load_new_model(selected_model):
        """Replace the module-level model components with *selected_model*.

        Returns one value per wired output:
        ``(status, image_display, output_text)``.
        """
        global model, tokenizer, image_processor, current_model_name
        try:
            components = load_model_and_components(selected_model)
        except RuntimeError as err:
            # load_model_and_components wraps every failure in RuntimeError.
            # Nothing has been rebound yet, so the previous model stays active;
            # report the problem instead of leaving "Loading..." on screen.
            return f"Failed to load model '{selected_model}': {err}", None, None
        model, tokenizer, image_processor = components
        current_model_name = selected_model
        return "Model loaded successfully.", None, None

    # Chain the two steps with .then() so the "loading" status is always
    # written before the (slow) load starts and the final status is written
    # last; two independent .change listeners gave no such ordering guarantee.
    model_selector.change(
        fn=switch_model,
        inputs=model_selector,
        outputs=[loading_message, upload_button, image_display, output_text],
    ).then(
        fn=load_new_model,
        inputs=model_selector,
        outputs=[loading_message, image_display, output_text],
    )
    upload_button.change(fn=handle_uploaded_image, inputs=upload_button, outputs=[image_display, output_text])

# Launch after the Blocks context has been closed, i.e. once the layout and
# event wiring above are complete.
interface.launch(share=False)