BoyuNLP committed (verified)
Commit a3a16bd · 1 Parent(s): e8ed4bd

Upload 5 files

Files changed (5)
  1. README.md +5 -9
  2. app.py +377 -0
  3. app_legacy.py +128 -0
  4. requirements.txt +4 -0
  5. votes.json +0 -0
README.md CHANGED
@@ -1,14 +1,10 @@
  ---
- title: UGround V1 2B
- emoji: 📚
- colorFrom: blue
+ title: UGround-Qwen2-VL
+ emoji: 💻
+ colorFrom: purple
  colorTo: blue
  sdk: gradio
- sdk_version: 5.9.1
+ sdk_version: 5.6.0
  app_file: app.py
  pinned: false
- license: apache-2.0
- short_description: GUI visual grounding model
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
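
For reference, the full front matter of the new README.md (reconstructed from the context and added lines above) reads:

    ---
    title: UGround-Qwen2-VL
    emoji: 💻
    colorFrom: purple
    colorTo: blue
    sdk: gradio
    sdk_version: 5.6.0
    app_file: app.py
    pinned: false
    ---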
 
 
 
 
app.py ADDED
@@ -0,0 +1,377 @@
+ import base64
+ import json
+ from datetime import datetime
+ import gradio as gr
+ import torch
+ import spaces
+ from PIL import Image, ImageDraw
+ from qwen_vl_utils import process_vision_info
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ import ast
+ import os
+ import numpy as np
+ from huggingface_hub import hf_hub_download, list_repo_files
+
+ # Define constants
+ DESCRIPTION = "[UGround Demo](https://osu-nlp-group.github.io/UGround/)"
+ _SYSTEM = "You are a very helpful assistant."
+ MIN_PIXELS = 256 * 28 * 28
+ MAX_PIXELS = 1344 * 1344
+
+ # Specify the model repository and destination folder
+ # https://huggingface.co/osunlp/UGround-V1-2B
+ model_repo = "osunlp/UGround-V1-2B"
+ destination_folder = "./UGround-V1-2B"
+
+ # Ensure the destination folder exists
+ os.makedirs(destination_folder, exist_ok=True)
+
+ # List all files in the repository
+ files = list_repo_files(repo_id=model_repo)
+
+ # Download each file to the destination folder
+ for file in files:
+     file_path = hf_hub_download(repo_id=model_repo, filename=file, local_dir=destination_folder)
+     print(f"Downloaded {file} to {file_path}")
+
+ # Load the model on CPU; run_showui moves it to the GPU inside the @spaces.GPU handler
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     destination_folder,
+     torch_dtype=torch.bfloat16,
+     device_map="cpu",
+ )
+
+ # Load the processor
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
+
+ # Helper functions
+ def draw_point(image_input, point=None, radius=5):
+     """Draw a point on the image. The point is given in 0-1000 model coordinates."""
+     if isinstance(image_input, str):
+         image = Image.open(image_input)
+     else:
+         image = Image.fromarray(np.uint8(image_input))
+
+     if point:
+         x, y = round(point[0] / 1000 * image.width), round(point[1] / 1000 * image.height)
+         ImageDraw.Draw(image).ellipse((x - radius, y - radius, x + radius, y + radius), fill='red')
+     return image
+
+ def array_to_image_path(image_array, session_id):
+     """Save the uploaded image and return its path."""
+     if image_array is None:
+         raise ValueError("No image provided. Please upload an image before submitting.")
+     img = Image.fromarray(np.uint8(image_array))
+     filename = f"{session_id}.png"
+     img.save(filename)
+     return os.path.abspath(filename)
+
+ def crop_image(image_path, click_xy, crop_factor=0.5):
+     """Crop the image around the click point (click_xy is relative, in the 0-1 range)."""
+     image = Image.open(image_path)
+     width, height = image.size
+     crop_width, crop_height = int(width * crop_factor), int(height * crop_factor)
+
+     center_x, center_y = int(click_xy[0] * width), int(click_xy[1] * height)
+     left = max(center_x - crop_width // 2, 0)
+     upper = max(center_y - crop_height // 2, 0)
+     right = min(center_x + crop_width // 2, width)
+     lower = min(center_y + crop_height // 2, height)
+
+     cropped_image = image.crop((left, upper, right, lower))
+     cropped_image_path = f"cropped_{os.path.basename(image_path)}"
+     cropped_image.save(cropped_image_path)
+
+     return cropped_image_path
+
+ @spaces.GPU
+ def run_showui(image, query, session_id, iterations=1):
+     """Main function for iterative inference."""
+     image_path = array_to_image_path(image, session_id)
+
+     click_xy = None
+     images_during_iterations = []  # List to store images at each step
+
+     for _ in range(iterations):
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": "You are a very helpful assistant"},
+                     {"type": "image", "image": image_path, "min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS},
+                     {"type": "text", "text": f"""Your task is to help the user identify the precise coordinates (x, y) of a specific area/element/object on the screen based on a description.
+
+ - Your response should aim to point to the center or a representative point within the described area/element/object as accurately as possible.
+ - If the description is unclear or ambiguous, infer the most relevant area or element based on its likely context or purpose.
+ - Your answer should be a single string (x, y) corresponding to the point of the interest.
+
+ Description: {query}
+
+ Answer:"""}
+                 ],
+             }
+         ]
+
+         global model
+         model = model.to("cuda")
+
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt"
+         )
+         inputs = inputs.to("cuda")
+
+         generated_ids = model.generate(**inputs, max_new_tokens=128, temperature=0)
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         output_text = processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )[0]
+
+         # The model answers with a coordinate string such as "(512, 340)", scaled to 0-1000
+         click_xy = ast.literal_eval(output_text)
+
+         # Draw point on the current image
+         result_image = draw_point(image_path, click_xy, radius=10)
+         images_during_iterations.append(result_image)  # Store the current image
+
+         # Crop the image for the next iteration (crop_image expects relative 0-1 coordinates)
+         image_path = crop_image(image_path, (click_xy[0] / 1000, click_xy[1] / 1000))
+
+     return images_during_iterations, str(click_xy)
+
+ def save_and_upload_data(image, query, session_id, is_example_image, votes=None):
+     """Save the query image and metadata to local files."""
+     if is_example_image == "True":
+         return
+
+     votes = votes or {"upvotes": 0, "downvotes": 0}
+
+     # Save image locally
+     image_file_name = f"{session_id}.png"
+     image.save(image_file_name)
+
+     data = {
+         "image_path": image_file_name,
+         "query": query,
+         "votes": votes,
+         "timestamp": datetime.now().isoformat()
+     }
+
+     local_file_name = f"{session_id}.json"
+
+     with open(local_file_name, "w") as f:
+         json.dump(data, f)
+
+     return data
+
+ def update_vote(vote_type, session_id, is_example_image):
+     """Update the vote count and rewrite the session's JSON file."""
+     if is_example_image == "True":
+         return "Example image."
+
+     local_file_name = f"{session_id}.json"
+
+     with open(local_file_name, "r") as f:
+         data = json.load(f)
+
+     if vote_type == "upvote":
+         data["votes"]["upvotes"] += 1
+     elif vote_type == "downvote":
+         data["votes"]["downvotes"] += 1
+
+     with open(local_file_name, "w") as f:
+         json.dump(data, f)
+
+     return f"Thank you for your {vote_type}!"
+
+ with open("./assets/showui.png", "rb") as image_file:
+     base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+
+
+ # [
+ #     [f"{cur_dir}/amazon.jpg", f"Search bar at the top of the page"],
+ #     [f"{cur_dir}/shopping.jpg", f"delete button for the second item in the cart list"],
+ #     [f"{cur_dir}/ios.jpg", f"Open Maps"],
+ #     [f"{cur_dir}/toggle.jpg", f"toggle button labeled by VPN"],
+ #     [f"{cur_dir}/semantic.jpg", f"Home"],
+ #     [f"{cur_dir}/accweather.jpg", f"Select May"],
+ #     [f"{cur_dir}/arxiv.jpg", f"Home"],
+ #     [f"{cur_dir}/arxiv.jpg", f"Edit the page"],
+ #     [f"{cur_dir}/ios.jpg", f"icon at the top right corner"],
+ #     [f"{cur_dir}/health.jpg", f"text labeled by 2023/11/26"],
+
+
+ examples = [
+     ["./examples/amazon.jpg", "Search bar at the top of the page", True],
+     ["./examples/shopping.jpg", "delete button for the second item in the cart list", True],
+     ["./examples/ios.jpg", "Open Maps", True],
+     ["./examples/toggle.jpg", "toggle button labeled by VPN", True],
+     ["./examples/semantic.jpg", "Home", True],
+     ["./examples/accweather.jpg", "Select May", True],
+     ["./examples/arxiv.jpg", "Home", True],
+     ["./examples/arxiv.jpg", "Edit the page", True],
+     ["./examples/ios.jpg", "icon at the top right corner", True],
+     ["./examples/health.jpg", "text labeled by 2023/11/26", True],
+     ["./examples/app_store.png", "Download Kindle.", True],
+     ["./examples/ios_setting.png", "Turn off Do not disturb.", True],
+     # ["./examples/apple_music.png", "Star to favorite.", True],
+     # ["./examples/map.png", "Boston.", True],
+     # ["./examples/wallet.png", "Scan a QR code.", True],
+     # ["./examples/word.png", "More shapes.", True],
+     # ["./examples/web_shopping.png", "Proceed to checkout.", True],
+     # ["./examples/web_forum.png", "Post my comment.", True],
+     # ["./examples/safari_google.png", "Click on search bar.", True],
+ ]
+
+ def build_demo(embed_mode, concurrency_count=1):
+     with gr.Blocks(title="UGround Demo", theme=gr.themes.Default()) as demo:
+         state_image_path = gr.State(value=None)
+         state_session_id = gr.State(value=None)
+
+         # if not embed_mode:
+         #     gr.HTML(
+         #         f"""
+         #         <div style="text-align: center; margin-bottom: 20px;">
+         #             <div style="display: flex; justify-content: center;">
+         #                 <img src="https://raw.githubusercontent.com/showlab/ShowUI/refs/heads/main/assets/showui.jpg" alt="ShowUI" width="320" style="margin-bottom: 10px;"/>
+         #             </div>
+         #             <p>ShowUI is a lightweight vision-language-action model for GUI agents.</p>
+         #             <div style="display: flex; justify-content: center; gap: 15px; font-size: 20px;">
+         #                 <a href="https://huggingface.co/showlab/ShowUI-2B" target="_blank">
+         #                     <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-ShowUI--2B-blue" alt="model"/>
+         #                 </a>
+         #                 <a href="https://arxiv.org/abs/2411.17465" target="_blank">
+         #                     <img src="https://img.shields.io/badge/arXiv%20paper-2411.17465-b31b1b.svg" alt="arXiv"/>
+         #                 </a>
+         #                 <a href="https://github.com/showlab/ShowUI" target="_blank">
+         #                     <img src="https://img.shields.io/badge/GitHub-ShowUI-black" alt="GitHub"/>
+         #                 </a>
+         #             </div>
+         #         </div>
+         #         """
+         #     )
+
+         with gr.Row():
+             with gr.Column(scale=3):
+                 imagebox = gr.Image(type="numpy", label="Input Screenshot", placeholder="""#Try UGround with screenshots!
+
+
+ Windows: [Win + Shift + S]
+ macOS: [Command + Shift + 3]
+
+ Then upload/paste from clipboard 🤗
+ """)
+
+                 # Add a slider for iteration count
+                 iteration_slider = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="Refinement Steps")
+
+                 textbox = gr.Textbox(
+                     show_label=True,
+                     placeholder="Enter a query (e.g., 'Click Nahant')",
+                     label="Query",
+                 )
+                 submit_btn = gr.Button(value="Submit", variant="primary")
+
+                 # Examples component
+                 gr.Examples(
+                     examples=[[e[0], e[1]] for e in examples],
+                     inputs=[imagebox, textbox],
+                     outputs=[textbox],  # Only update the query textbox
+                     examples_per_page=3,
+                 )
+
+                 # Add a hidden dropdown to pass the `is_example` flag
+                 is_example_dropdown = gr.Dropdown(
+                     choices=["True", "False"],
+                     value="False",
+                     visible=False,
+                     label="Is Example Image",
+                 )
+
+                 def set_is_example(query):
+                     # Find the example and return its `is_example` flag
+                     for _, example_query, is_example in examples:
+                         if query.strip() == example_query.strip():
+                             return str(is_example)  # Return as string for Dropdown compatibility
+                     return "False"
+
+                 textbox.change(
+                     set_is_example,
+                     inputs=[textbox],
+                     outputs=[is_example_dropdown],
+                 )
+
+             with gr.Column(scale=8):
+                 output_gallery = gr.Gallery(label="Iterative Refinement", object_fit="contain", preview=True)
+                 # output_gallery = gr.Gallery(label="Iterative Refinement")
+                 gr.HTML(
+                     """
+                     <p><strong>Note:</strong> The <span style="color: red;">red point</span> on the output image represents the predicted clickable coordinates.</p>
+                     """
+                 )
+                 output_coords = gr.Textbox(label="Final Clickable Coordinates")
+
+                 gr.HTML(
+                     """
+                     <p><strong>🤔 Good or bad? Rate your experience to help us improve! ⬇️</strong></p>
+                     """
+                 )
+                 with gr.Row(elem_id="action-buttons", equal_height=True):
+                     upvote_btn = gr.Button(value="👍 Looks good!", variant="secondary")
+                     downvote_btn = gr.Button(value="👎 Too bad!", variant="secondary")
+                     clear_btn = gr.Button(value="🗑️ Clear", interactive=True)
+
+         def on_submit(image, query, iterations, is_example_image):
+             if image is None:
+                 raise ValueError("No image provided. Please upload an image before submitting.")
+
+             session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+             images_during_iterations, click_coords = run_showui(image, query, session_id, iterations)
+
+             save_and_upload_data(images_during_iterations[0], query, session_id, is_example_image)
+
+             return images_during_iterations, click_coords, session_id
+
+         submit_btn.click(
+             on_submit,
+             [imagebox, textbox, iteration_slider, is_example_dropdown],
+             [output_gallery, output_coords, state_session_id],
+         )
+
+         clear_btn.click(
+             lambda: (None, None, None, None, None),  # one None per output component
+             inputs=None,
+             outputs=[imagebox, textbox, output_gallery, output_coords, state_session_id],
+             queue=False
+         )
+
+         upvote_btn.click(
+             lambda session_id, is_example_image: update_vote("upvote", session_id, is_example_image),
+             inputs=[state_session_id, is_example_dropdown],
+             outputs=[],
+             queue=False
+         )
+
+         downvote_btn.click(
+             lambda session_id, is_example_image: update_vote("downvote", session_id, is_example_image),
+             inputs=[state_session_id, is_example_dropdown],
+             outputs=[],
+             queue=False
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = build_demo(embed_mode=False)
+     demo.queue(api_open=False).launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         ssr_mode=False,
+         debug=True,
+     )
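
In app.py above, the model replies with a plain string containing an (x, y) pair scaled to 0-1000; ast.literal_eval parses it and draw_point rescales it to pixels. A minimal standalone sketch of that post-processing (the reply string and file name here are illustrative, not taken from this commit):

    import ast
    from PIL import Image

    output_text = "(512, 340)"            # illustrative model reply in 0-1000 coordinates
    click_xy = ast.literal_eval(output_text)

    image = Image.open("screenshot.png")  # hypothetical local screenshot
    x = round(click_xy[0] / 1000 * image.width)
    y = round(click_xy[1] / 1000 * image.height)
    print(f"pixel coordinates: ({x}, {y})")

The legacy app below (app_legacy.py) instead prompts for relative coordinates scaled 0 to 1, so its draw_point multiplies the pair by the image size directly.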
app_legacy.py ADDED
@@ -0,0 +1,128 @@
+ import base64
+ import gradio as gr
+ import torch
+ from PIL import Image, ImageDraw
+ from qwen_vl_utils import process_vision_info
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+ import ast
+ import os
+ from datetime import datetime
+ import numpy as np
+
+ # Function to draw a point on the image
+ def draw_point(image_input, point=None, radius=5):
+     if isinstance(image_input, str):
+         image = Image.open(image_input)
+     else:
+         image = Image.fromarray(np.uint8(image_input))
+
+     if point:
+         x, y = point[0] * image.width, point[1] * image.height
+         ImageDraw.Draw(image).ellipse((x - radius, y - radius, x + radius, y + radius), fill='red')
+     return image
+
+ # Function to save the uploaded image and return its path
+ def array_to_image_path(image_array):
+     if image_array is None:
+         raise ValueError("No image provided. Please upload an image before submitting.")
+     img = Image.fromarray(np.uint8(image_array))
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"image_{timestamp}.png"
+     img.save(filename)
+     return os.path.abspath(filename)
+
+ # Load the model
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     # "./showui-2b",
+     "/users/difei/siyuan/showui_demo/showui-2b",
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     # verbose=True,
+ )
+
+ # Define minimum and maximum pixel thresholds
+ min_pixels = 256 * 28 * 28
+ max_pixels = 1344 * 28 * 28
+
+ # Load the processor
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
+ # Hugging Face Space description
+ DESCRIPTION = "[ShowUI-2B Demo](https://huggingface.co/showlab/ShowUI-2B)"
+
+ # Define the system instruction
+ _SYSTEM = "Based on the screenshot of the page, I give a text description and you give its corresponding location. The coordinate represents a clickable location [x, y] for an element, which is a relative coordinate on the screenshot, scaled from 0 to 1."
+
+ # Define the main function for inference
+ def run_showui(image, query):
+     image_path = array_to_image_path(image)
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": _SYSTEM},
+                 {"type": "image", "image": image_path, "min_pixels": min_pixels, "max_pixels": max_pixels},
+                 {"type": "text", "text": query}
+             ],
+         }
+     ]
+
+     # Prepare inputs for the model
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt"
+     )
+     inputs = inputs.to("cuda")
+
+     # Generate output
+     generated_ids = model.generate(**inputs, max_new_tokens=128)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )[0]
+
+     # Parse the output into coordinates
+     click_xy = ast.literal_eval(output_text)
+
+     # Draw the point on the image
+     result_image = draw_point(image_path, click_xy, radius=10)
+     return result_image, str(click_xy)
+
+ with open("./assets/showui.png", "rb") as image_file:
+     base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+
+ # Gradio UI
+ with gr.Blocks() as demo:
+     gr.HTML(
+         f"""
+         <div style="text-align: center; margin-bottom: 20px;">
+             <a href="https://github.com/showlab/ShowUI" target="_blank">
+                 <img src="data:image/png;base64,{base64_image}" alt="ShowUI Logo" style="width: 200px; height: auto;"/>
+             </a>
+         </div>
+         """
+     )
+
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="ShowUI-2B Input"):
+         with gr.Row():
+             with gr.Column():
+                 input_img = gr.Image(label="Input Screenshot")
+                 text_input = gr.Textbox(label="Query (e.g., 'Click Nahant')")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_img = gr.Image(label="Output Image")
+                 output_coords = gr.Textbox(label="Clickable Coordinates")
+
+         submit_btn.click(run_showui, [input_img, text_input], [output_img, output_coords])
+
+ demo.queue(api_open=False)
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ qwen-vl-utils==0.0.8
+ torchvision
+ transformers
+ accelerate
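
requirements.txt leaves transformers unpinned; the Qwen2VLForConditionalGeneration class imported by both apps only exists in relatively recent transformers releases, so a quick check along these lines (a sketch) can confirm the installed build is compatible:

    # Hypothetical environment check.
    import transformers
    print(transformers.__version__)
    # Raises ImportError on builds that predate Qwen2-VL support.
    from transformers import Qwen2VLForConditionalGeneration, AutoProcessor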
votes.json ADDED
File without changes