justinj92 committed on
Commit 39bd209 · verified · 1 Parent(s): 5c757cb

Upload 7 files

Files changed (7)
  1. app.py +171 -0
  2. requirements.txt +7 -0
  3. utils/__init__.py +0 -0
  4. utils/annotate.py +17 -0
  5. utils/imports.py +13 -0
  6. utils/models.py +73 -0
  7. utils/tasks.py +79 -0
app.py ADDED
@@ -0,0 +1,171 @@
+ from typing import Tuple, Optional
+
+ import gradio as gr
+ import spaces
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from gradio_image_prompter import ImagePrompter
+
+ from utils.annotate import annotate_with_boxes
+ from utils.models import load_models, run_inference, CHECKPOINTS, \
+     pre_process_region_task_input, post_process_region_output
+ from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
+     CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
+     MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
+     IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
+     TEXTBOX_OUTPUT_TASK_NAMES, IMAGE_TO_IMAGE_TASK_NAMES, IMAGE_TO_TEXT_TASK_NAMES, \
+     IMAGE_PROMPT_TO_IMAGE_TASK_NAMES, REGION_PROPOSAL_TASK_NAME, \
+     DENSE_REGION_CAPTION_TASK_NAME
+
+ MARKDOWN = """
+ # Florence-2 🔥
+
+ Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the
+ MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities
+ across tasks such as captioning, object detection, grounding, and segmentation.
+ The model takes images and task prompts as input, generating the desired results in
+ text format. It uses a DaViT vision encoder to convert images into visual token
+ embeddings. These are then concatenated with BERT-generated text embeddings and
+ processed by a transformer-based multi-modal encoder-decoder to generate the response.
+ """
+ EXAMPLES = [
+     ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", REGION_PROPOSAL_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", DENSE_REGION_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+     ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/temple-bar-dublin-world-famous-irish-pub.jpg?download=true", None],
+ ]
+
+ # DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ DEVICE = "cuda"
+ MODELS, PROCESSORS = load_models(DEVICE)
+
+
+ @spaces.GPU
+ def process(
+     checkpoint_dropdown,
+     task_dropdown,
+     image_input,
+     image_prompter_input
+ ) -> Tuple[Optional[Image.Image], Optional[str]]:
+     model = MODELS[checkpoint_dropdown]
+     processor = PROCESSORS[checkpoint_dropdown]
+     task = TASKS[task_dropdown]
+
+     if task_dropdown in IMAGE_TO_IMAGE_TASK_NAMES:
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         detections = sv.Detections.from_lmm(
+             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+         return annotate_with_boxes(image_input, detections), None
+
+     elif task_dropdown in IMAGE_TO_TEXT_TASK_NAMES:
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         return None, response[task]
+
+     elif task_dropdown in IMAGE_PROMPT_TO_IMAGE_TASK_NAMES:
+         detections_list = []
+
+         print(image_prompter_input)
+
+         image_input = image_prompter_input["image"]
+         for prompt in image_prompter_input["points"]:
+             text = pre_process_region_task_input(
+                 prompt=prompt,
+                 resolution_wh=image_input.size
+             )
+             _, response = run_inference(
+                 model, processor, DEVICE, image_input, task, text)
+             detections = sv.Detections.from_lmm(
+                 lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+             detections_list.append(detections)
+         detections = sv.Detections.merge(detections_list=detections_list)
+         detections = post_process_region_output(
+             detections=detections, resolution_wh=image_input.size)
+
+         return annotate_with_boxes(image_input, detections), None
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Row():
+         checkpoint_dropdown_component = gr.Dropdown(
+             choices=CHECKPOINTS,
+             value=CHECKPOINTS[0],
+             label="Model", info="Select a Florence 2 model to use.",
+             interactive=True
+         )
+         task_dropdown_component = gr.Dropdown(
+             choices=TASK_NAMES,
+             value=TASK_NAMES[0],
+             label="Task", info="Select a task to perform with the model.",
+             interactive=True
+         )
+
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(
+                 type='pil', label='Upload image')
+             image_prompter_input_component = ImagePrompter(
+                 type='pil', label='Image prompt', visible=False)
+             submit_button_component = gr.Button(value='Submit', variant='primary')
+
+         with gr.Column():
+             image_output_component = gr.Image(type='pil', label='Image Output')
+             text_output_component = gr.Textbox(label='Caption Output', visible=False)
+     with gr.Row():
+         gr.Examples(
+             fn=process,
+             examples=EXAMPLES,
+             inputs=[
+                 checkpoint_dropdown_component,
+                 task_dropdown_component,
+                 image_input_component,
+                 image_prompter_input_component
+             ],
+             outputs=[
+                 image_output_component,
+                 text_output_component
+             ],
+             run_on_click=True
+         )
+
+     def on_dropdown_change(text):
+         return [
+             gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
+             ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
+             gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
+             gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
+         ]
+
+     task_dropdown_component.change(
+         on_dropdown_change,
+         inputs=[task_dropdown_component],
+         outputs=[
+             image_input_component,
+             image_prompter_input_component,
+             image_output_component,
+             text_output_component
+         ]
+     )
+     submit_button_component.click(
+         fn=process,
+         inputs=[
+             checkpoint_dropdown_component,
+             task_dropdown_component,
+             image_input_component,
+             image_prompter_input_component
+         ],
+         outputs=[
+             image_output_component,
+             text_output_component
+         ]
+     )
+
+ demo.launch(debug=False, show_error=True, max_threads=1)
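For reference, a minimal sketch of what one pass through the code above looks like outside Gradio (not part of this commit; it assumes a CUDA device and a hypothetical local file example.jpg):

from PIL import Image
import supervision as sv

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference
from utils.tasks import TASKS, OBJECT_DETECTION_TASK_NAME

DEVICE = "cuda"
MODELS, PROCESSORS = load_models(DEVICE)  # loads all four CHECKPOINTS
checkpoint = "microsoft/Florence-2-large-ft"

image = Image.open("example.jpg")  # hypothetical input image
_, response = run_inference(
    MODELS[checkpoint], PROCESSORS[checkpoint], DEVICE,
    image, TASKS[OBJECT_DETECTION_TASK_NAME])  # prompt token "<OD>"
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size)
annotate_with_boxes(image, detections).save("annotated.jpg")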
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ einops
+ spaces
+ timm
+ gradio
+ transformers
+ gradio-image-prompter
+ supervision==0.22.0rc1
utils/__init__.py ADDED
File without changes
utils/annotate.py ADDED
@@ -0,0 +1,17 @@
+ import supervision as sv
+ from PIL import Image
+
+
+ def annotate_with_boxes(image: Image, detections: sv.Detections) -> Image:
+     annotated_image = image.copy()
+     thickness = sv.calculate_optimal_line_thickness(resolution_wh=image.size)
+     text_scale = sv.calculate_optimal_text_scale(resolution_wh=image.size)
+     bounding_box_annotator = sv.BoundingBoxAnnotator(
+         color_lookup=sv.ColorLookup.INDEX, thickness=thickness)
+     label_annotator = sv.LabelAnnotator(
+         color_lookup=sv.ColorLookup.INDEX,
+         text_scale=text_scale,
+         text_thickness=thickness)
+     annotated_image = bounding_box_annotator.annotate(annotated_image, detections)
+     annotated_image = label_annotator.annotate(annotated_image, detections)
+     return annotated_image
utils/imports.py ADDED
@@ -0,0 +1,13 @@
+ import os
+
+ from typing import Union
+ from transformers.dynamic_module_utils import get_imports
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Workaround for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
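This workaround strips flash_attn from the imports that transformers detects in Florence-2's remote modeling code, so the package does not have to be installed just to load the model. The patch is applied in utils/models.py below; a rough sketch of the pattern (not part of this commit):

from unittest.mock import patch

from transformers import AutoModelForCausalLM

from utils.imports import fixed_get_imports

# While the patch is active, loading Florence-2 with trust_remote_code=True
# no longer requires flash_attn to be importable.
with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True)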
utils/models.py ADDED
@@ -0,0 +1,73 @@
+ from typing import Tuple, Dict, Any, List
+ from unittest.mock import patch
+
+ import numpy as np
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+
+ from utils.imports import fixed_get_imports
+
+ CHECKPOINTS = [
+     "microsoft/Florence-2-large-ft",
+     "microsoft/Florence-2-large",
+     "microsoft/Florence-2-base-ft",
+     "microsoft/Florence-2-base",
+ ]
+
+
+ def load_models(device: torch.device) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         models = {}
+         processors = {}
+         for checkpoint in CHECKPOINTS:
+             models[checkpoint] = AutoModelForCausalLM.from_pretrained(
+                 checkpoint, trust_remote_code=True).to(device).eval()
+             processors[checkpoint] = AutoProcessor.from_pretrained(
+                 checkpoint, trust_remote_code=True)
+         return models, processors
+
+
+ def run_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
+
+
+ def pre_process_region_task_input(
+     prompt: List[float],
+     resolution_wh: Tuple[int, int]
+ ) -> str:
+     x1, y1, _, x2, y2, _ = prompt
+     w, h = resolution_wh
+     box = np.array([x1, y1, x2, y2])
+     box /= np.array([w, h, w, h])
+     box *= 1000
+     return "".join([f"<loc_{int(coordinate)}>" for coordinate in box])
+
+
+ def post_process_region_output(
+     detections: sv.Detections,
+     resolution_wh: Tuple[int, int]
+ ) -> sv.Detections:
+     w, h = resolution_wh
+     detections.xyxy = (detections.xyxy / 1000 * np.array([w, h, w, h])).astype(np.int32)
+     return detections
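To make the region-prompt encoding above concrete, a small worked sketch with hypothetical values (not part of this commit): ImagePrompter yields boxes as [x1, y1, _, x2, y2, _]; pre_process_region_task_input normalizes the corners by the image size and rescales them to the 0-1000 grid used by Florence-2's <loc_...> tokens, and post_process_region_output later maps the model's boxes back to pixel coordinates.

from utils.models import pre_process_region_task_input

# hypothetical 1000x800 image, box from (100, 200) to (300, 400)
text = pre_process_region_task_input(
    prompt=[100.0, 200.0, 2.0, 300.0, 400.0, 3.0],
    resolution_wh=(1000, 800))
print(text)  # <loc_100><loc_250><loc_300><loc_500>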
utils/tasks.py ADDED
@@ -0,0 +1,79 @@
+ OBJECT_DETECTION_TASK_NAME = "Object Detection"
+ REGION_PROPOSAL_TASK_NAME = "Region Proposal"
+ DENSE_REGION_CAPTION_TASK_NAME = "Dense Region Caption"
+ CAPTION_TASK_NAME = "Caption"
+ DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
+ MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
+ OCR_TASK_NAME = "OCR"
+ OCR_WITH_REGION_TASK_NAME = "OCR with Region"
+ REGION_TO_CATEGORY_TASK_NAME = "Region to Category"
+ REGION_TO_DESCRIPTION_TASK_NAME = "Region to Description"
+
+ TASK_NAMES = [
+     OBJECT_DETECTION_TASK_NAME,
+     REGION_PROPOSAL_TASK_NAME,
+     DENSE_REGION_CAPTION_TASK_NAME,
+     CAPTION_TASK_NAME,
+     DETAILED_CAPTION_TASK_NAME,
+     MORE_DETAILED_CAPTION_TASK_NAME,
+     OCR_TASK_NAME,
+     OCR_WITH_REGION_TASK_NAME,
+     REGION_TO_CATEGORY_TASK_NAME,
+     REGION_TO_DESCRIPTION_TASK_NAME
+ ]
+ TASKS = {
+     OBJECT_DETECTION_TASK_NAME: "<OD>",
+     REGION_PROPOSAL_TASK_NAME: "<REGION_PROPOSAL>",
+     DENSE_REGION_CAPTION_TASK_NAME: "<DENSE_REGION_CAPTION>",
+     CAPTION_TASK_NAME: "<CAPTION>",
+     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
+     MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
+     OCR_TASK_NAME: "<OCR>",
+     OCR_WITH_REGION_TASK_NAME: "<OCR_WITH_REGION>",
+     REGION_TO_CATEGORY_TASK_NAME: "<REGION_TO_CATEGORY>",
+     REGION_TO_DESCRIPTION_TASK_NAME: "<REGION_TO_DESCRIPTION>"
+ }
+ IMAGE_INPUT_TASK_NAMES = [
+     OBJECT_DETECTION_TASK_NAME,
+     REGION_PROPOSAL_TASK_NAME,
+     DENSE_REGION_CAPTION_TASK_NAME,
+     CAPTION_TASK_NAME,
+     DETAILED_CAPTION_TASK_NAME,
+     MORE_DETAILED_CAPTION_TASK_NAME,
+     OCR_TASK_NAME,
+     OCR_WITH_REGION_TASK_NAME,
+ ]
+ IMAGE_PROMPTER_INPUT_TASK_NAMES = [
+     REGION_TO_CATEGORY_TASK_NAME,
+     REGION_TO_DESCRIPTION_TASK_NAME
+ ]
+ IMAGE_OUTPUT_TASK_NAMES = [
+     OBJECT_DETECTION_TASK_NAME,
+     REGION_PROPOSAL_TASK_NAME,
+     DENSE_REGION_CAPTION_TASK_NAME,
+     OCR_WITH_REGION_TASK_NAME,
+     REGION_TO_CATEGORY_TASK_NAME,
+     REGION_TO_DESCRIPTION_TASK_NAME
+ ]
+ TEXTBOX_OUTPUT_TASK_NAMES = [
+     CAPTION_TASK_NAME,
+     DETAILED_CAPTION_TASK_NAME,
+     MORE_DETAILED_CAPTION_TASK_NAME,
+     OCR_TASK_NAME
+ ]
+ IMAGE_TO_IMAGE_TASK_NAMES = [
+     OBJECT_DETECTION_TASK_NAME,
+     OCR_WITH_REGION_TASK_NAME,
+     REGION_PROPOSAL_TASK_NAME,
+     DENSE_REGION_CAPTION_TASK_NAME
+ ]
+ IMAGE_TO_TEXT_TASK_NAMES = [
+     CAPTION_TASK_NAME,
+     DETAILED_CAPTION_TASK_NAME,
+     MORE_DETAILED_CAPTION_TASK_NAME,
+     OCR_TASK_NAME
+ ]
+ IMAGE_PROMPT_TO_IMAGE_TASK_NAMES = [
+     REGION_TO_CATEGORY_TASK_NAME,
+     REGION_TO_DESCRIPTION_TASK_NAME
+ ]
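The name lists above drive both prompt construction and component visibility in app.py; a quick sketch of how a single task name is consumed (not part of this commit):

from utils.tasks import (
    TASKS, OBJECT_DETECTION_TASK_NAME, IMAGE_INPUT_TASK_NAMES,
    IMAGE_OUTPUT_TASK_NAMES, TEXTBOX_OUTPUT_TASK_NAMES)

task = OBJECT_DETECTION_TASK_NAME
print(TASKS[task])                        # "<OD>" -- prompt token passed to run_inference
print(task in IMAGE_INPUT_TASK_NAMES)     # True  -- plain image input is shown
print(task in IMAGE_OUTPUT_TASK_NAMES)    # True  -- annotated image output is shown
print(task in TEXTBOX_OUTPUT_TASK_NAMES)  # False -- caption textbox stays hidden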