File size: 3,279 Bytes
ade70cf
1d51385
d69fd19
d7f29ce
 
ef3da92
d7f29ce
 
39ae23a
 
ade70cf
50fae8a
d502400
50fae8a
ade70cf
d256f3b
833928a
 
 
ca16909
d256f3b
 
 
beec895
50fae8a
1d51385
 
 
 
69958d1
ade70cf
 
 
 
 
 
 
 
 
 
1d51385
 
ade70cf
 
c8f76e0
 
 
6172e67
c8f76e0
 
ccd3ca3
c8f76e0
6172e67
 
 
 
 
 
 
 
 
 
833928a
6172e67
 
1d51385
6172e67
 
 
 
 
 
 
 
 
a8f49dd
8b2d7f4
ef3da92
8b2d7f4
6172e67
8b2d7f4
ccd3ca3
6172e67
 
833928a
6172e67
 
ccd3ca3
6172e67
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces


from PIL import Image 


import os
import subprocess
import sys

# Best-effort install of flash-attn at startup (the return code is deliberately
# not checked, so a failed install does not stop the app from starting).
#
# FLASH_ATTENTION_SKIP_CUDA_BUILD is handed to the installer through the
# environment (presumably it makes the flash-attn build skip compiling its CUDA
# extension -- confirm against flash-attn's setup.py). It is merged into a copy
# of the current environment: passing a bare dict as ``env`` replaces the whole
# environment of the child process, which drops PATH, HOME, proxy settings, etc.
#
# pip is invoked through the running interpreter with an argument list, so the
# package lands in the environment this app actually runs in and no shell
# string parsing is involved.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)

# Load the fine-tuned checkpoint once at import time, move it to the GPU and
# switch it to evaluation mode. ``trust_remote_code=True`` lets transformers
# execute the modelling code shipped inside the checkpoint repository.
model = AutoModelForCausalLM.from_pretrained(
    'HuggingFaceM4/Florence-2-DocVQA',
    trust_remote_code=True,
)
model = model.to("cuda").eval()

# Matching processor from the same repository; it is used both to build the
# model inputs and to decode / post-process the generated ids.
processor = AutoProcessor.from_pretrained(
    'HuggingFaceM4/Florence-2-DocVQA',
    trust_remote_code=True,
)


# Markdown snippets rendered at the top of the page via gr.Markdown.
TITLE = "# [Florence-2-DocVQA Demo](https://huggingface.co/HuggingFaceM4/Florence-2-DocVQA)"
# The "fine-tuning" link must be an absolute URL: a bare relative target such
# as "finetune-florence2" is resolved against the page hosting the demo and
# does not lead to the blog post.
DESCRIPTION = (
    "The demo for Florence-2 fine-tuned on DocVQA dataset. "
    "You can find the notebook [here](https://colab.research.google.com/drive/1hKDrJ5AH_o7I95PtZ9__VlCTNAo1Gjpf?usp=sharing). "
    "Read more about Florence-2 fine-tuning [here](https://huggingface.co/blog/finetune-florence2)."
)


# Palette of named colors. Nothing in the visible part of this file reads it;
# it is kept because code outside this view may still reference it.
colormap = [
    "blue",
    "orange",
    "green",
    "purple",
    "brown",
    "pink",
    "gray",
    "olive",
    "cyan",
    "red",
    "lime",
    "indigo",
    "violet",
    "aqua",
    "magenta",
    "coral",
    "gold",
    "tan",
    "skyblue",
]

@spaces.GPU
def run_example(task_prompt, image, text_input=None):
    """Run one generation pass of the model and post-process its output.

    Args:
        task_prompt: Task token string (the visible caller passes
            ``'<DocVQA>'``). It prefixes the prompt and is also passed as
            ``task`` to the processor's post-processing step.
        image: Image object exposing ``width`` and ``height`` (the visible
            caller passes a PIL image).
        text_input: Optional text appended directly after ``task_prompt``.
            When ``None`` the prompt is the task token alone.

    Returns:
        The value returned by ``processor.post_process_generation``; the
        visible caller indexes it with ``task_prompt``.
    """
    full_prompt = task_prompt if text_input is None else task_prompt + text_input

    # Tokenize the prompt and preprocess the image, then move both to the GPU.
    model_inputs = processor(text=full_prompt, images=image, return_tensors="pt")
    model_inputs = model_inputs.to("cuda")

    # Deterministic decoding: sampling disabled, 3 beams, up to 1024 new tokens.
    generation_kwargs = dict(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    output_ids = model.generate(**generation_kwargs)

    # Special tokens are kept in the decoded text; only the first sequence of
    # the batch is post-processed.
    decoded_batch = processor.batch_decode(output_ids, skip_special_tokens=False)
    first_sequence = decoded_batch[0]

    return processor.post_process_generation(
        first_sequence,
        task=task_prompt,
        image_size=(image.width, image.height),
    )

def process_image(image, text_input=None):
    """Answer a question about an image using the ``<DocVQA>`` task.

    Args:
        image: Image as a NumPy array (it is converted with
            ``Image.fromarray``), or ``None`` when no image was supplied.
        text_input: Optional question text forwarded to ``run_example``.

    Returns:
        The answer string stored under the ``'<DocVQA>'`` key of the
        ``run_example`` result, with every ``"<pad>"`` occurrence removed.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    if image is None:
        # Without this guard Image.fromarray(None) fails with an opaque
        # exception; gr.Error gives the user a readable message instead.
        raise gr.Error("Please provide an image before submitting.")

    pil_image = Image.fromarray(image)  # Convert NumPy array to PIL Image
    task_prompt = '<DocVQA>'
    answer = run_example(task_prompt, pil_image, text_input)[task_prompt]
    # Decoding keeps special tokens, so padding tokens are stripped here.
    return answer.replace("<pad>", "")


# Custom stylesheet handed to gr.Blocks(css=...). It styles the element with
# id "output" as a fixed-height, scrollable, bordered box.
# NOTE(review): no component in the visible layout sets elem_id="output", so
# this rule may currently match nothing -- confirm before relying on it.
# The trailing spaces after the semicolons are part of the original string.
css = (
    "\n"
    "  #output {\n"
    "    height: 500px; \n"
    "    overflow: auto; \n"
    "    border: 1px solid #ccc; \n"
    "  }\n"
)

# Build the page: header Markdown, then a single tab with the inputs on the
# left, the answer on the right, and a set of clickable examples below.
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    # This app runs the '<DocVQA>' task (question answering over an image),
    # not captioning, so the tab is labelled accordingly.
    with gr.Tab(label="Florence-2 DocVQA"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # Each example is [image file, question]; the image paths are relative,
        # so the files are expected next to this script.
        # cache_examples=True asks Gradio to store the outputs of
        # process_image for these examples instead of recomputing them.
        gr.Examples(
            examples=[
                ["hunt.jpg", 'What is this image?'],
                ["idefics2_architecture.png", 'How many tokens per image does it use?'],
                ["idefics2_architecture.png", "What type of encoder does the model use?"],
                ["image.jpg", "What's the share of Industry Switchers Gained?"]
            ],
            inputs=[input_img, text_input],
            outputs=[output_text],
            fn=process_image,
            cache_examples=True,
            label='Try the examples below'
        )

        # Submit runs the same function on the user's image and question.
        submit_btn.click(process_image, [input_img, text_input], [output_text])

demo.launch(debug=True)