ClownRat committed on
Commit
cf69be0
·
1 Parent(s): 03c1c79

Update demos.

Browse files
app.py CHANGED
@@ -92,7 +92,7 @@ class Chat:
92
 
93
  @spaces.GPU(duration=120)
94
  @torch.inference_mode()
95
- def generate(self, tensor: list, modals: list, prompt: str, first_run: bool, state):
96
  # TODO: support multiple turns of conversation.
97
  assert len(tensor) == len(modals)
98
 
@@ -117,8 +117,9 @@ class Chat:
117
  images_or_videos=tensor,
118
  modal_list=modals,
119
  do_sample=True,
120
- temperature=0.2,
121
- max_new_tokens=1024,
 
122
  use_cache=True,
123
  stopping_criteria=[stopping_criteria],
124
  )
@@ -129,7 +130,7 @@ class Chat:
129
 
130
 
131
  @spaces.GPU(duration=120)
132
- def generate(image, video, first_run, state, state_, textbox_in, dtype=torch.float16):
133
  flag = 1
134
  if not textbox_in:
135
  if len(state_.messages) > 0:
@@ -174,7 +175,7 @@ def generate(image, video, first_run, state, state_, textbox_in, dtype=torch.flo
174
  if os.path.exists(image) and os.path.exists(video):
175
  text_en_in = text_en_in.replace(DEFAULT_MMODAL_TOKEN['VIDEO'], '').strip()
176
  text_en_in = DEFAULT_MMODAL_TOKEN['VIDEO'] + '\n' + text_en_in
177
- text_en_out, state_ = handler.generate(tensor, modals, text_en_in, first_run=first_run, state=state_)
178
  state_.messages[-1] = (state_.roles[1], text_en_out)
179
 
180
  text_en_out = text_en_out.split('#')[0]
@@ -236,12 +237,65 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=gr.themes.Default(primar
236
  image = gr.Image(label="Input Image", type="filepath")
237
  video = gr.Video(label="Input Video")
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  cur_dir = os.path.dirname(os.path.abspath(__file__))
240
  gr.Examples(
241
  examples=[
242
  [
243
  f"{cur_dir}/examples/extreme_ironing.jpg",
244
- "What is unusual about this image?",
245
  ],
246
  [
247
  f"{cur_dir}/examples/waterview.jpg",
@@ -254,28 +308,30 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=gr.themes.Default(primar
254
  ],
255
  inputs=[image, textbox],
256
  )
257
-
258
- with gr.Column(scale=7):
259
- chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
260
- with gr.Row():
261
- with gr.Column(scale=8):
262
- textbox.render()
263
- with gr.Column(scale=1, min_width=50):
264
- submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
265
- with gr.Row(elem_id="buttons") as button_row:
266
- upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
267
- downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
268
- # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
269
- # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
270
- regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
271
- clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
 
 
272
 
273
  gr.Markdown(tos_markdown)
274
  gr.Markdown(learn_more_markdown)
275
 
276
  submit_btn.click(
277
  generate,
278
- [image, video, first_run, state, state_, textbox],
279
  [image, video, chatbot, first_run, state, state_, textbox])
280
 
281
  regenerate_btn.click(
@@ -283,7 +339,7 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=gr.themes.Default(primar
283
  [state, state_, textbox],
284
  [state, state_, textbox, chatbot, first_run]).then(
285
  generate,
286
- [image, video, first_run, state, state_, textbox],
287
  [image, video, chatbot, first_run, state, state_, textbox])
288
 
289
  clear_btn.click(
 
92
 
93
  @spaces.GPU(duration=120)
94
  @torch.inference_mode()
95
+ def generate(self, tensor: list, modals: list, prompt: str, first_run: bool, state, temperature, top_p, max_output_tokens):
96
  # TODO: support multiple turns of conversation.
97
  assert len(tensor) == len(modals)
98
 
 
117
  images_or_videos=tensor,
118
  modal_list=modals,
119
  do_sample=True,
120
+ temperature=temperature,
121
+ top_p=top_p,
122
+ max_new_tokens=max_output_tokens,
123
  use_cache=True,
124
  stopping_criteria=[stopping_criteria],
125
  )
 
130
 
131
 
132
  @spaces.GPU(duration=120)
133
+ def generate(image, video, first_run, state, state_, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
134
  flag = 1
135
  if not textbox_in:
136
  if len(state_.messages) > 0:
 
175
  if os.path.exists(image) and os.path.exists(video):
176
  text_en_in = text_en_in.replace(DEFAULT_MMODAL_TOKEN['VIDEO'], '').strip()
177
  text_en_in = DEFAULT_MMODAL_TOKEN['VIDEO'] + '\n' + text_en_in
178
+ text_en_out, state_ = handler.generate(tensor, modals, text_en_in, first_run=first_run, state=state_, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
179
  state_.messages[-1] = (state_.roles[1], text_en_out)
180
 
181
  text_en_out = text_en_out.split('#')[0]
 
237
  image = gr.Image(label="Input Image", type="filepath")
238
  video = gr.Video(label="Input Video")
239
 
240
+ with gr.Accordion("Parameters", open=False) as parameter_row:
241
+ # num_beams = gr.Slider(
242
+ # minimum=1,
243
+ # maximum=10,
244
+ # value=1,
245
+ # step=1,
246
+ # interactive=True,
247
+ # label="beam search numbers",
248
+ # )
249
+
250
+ temperature = gr.Slider(
251
+ minimum=0.1,
252
+ maximum=1.0,
253
+ value=0.2,
254
+ step=0.1,
255
+ interactive=True,
256
+ label="Temperature",
257
+ )
258
+
259
+ top_p = gr.Slider(
260
+ minimum=0.0,
261
+ maximum=1.0,
262
+ value=0.7,
263
+ step=0.1,
264
+ interactive=True,
265
+ label="Top P",
266
+ )
267
+
268
+ max_output_tokens = gr.Slider(
269
+ minimum=64,
270
+ maximum=1024,
271
+ value=512,
272
+ step=64,
273
+ interactive=True,
274
+ label="Max output tokens",
275
+ )
276
+
277
+ with gr.Column(scale=7):
278
+ chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
279
+ with gr.Row():
280
+ with gr.Column(scale=8):
281
+ textbox.render()
282
+ with gr.Column(scale=1, min_width=50):
283
+ submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
284
+ with gr.Row(elem_id="buttons") as button_row:
285
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
286
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
287
+ # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
288
+ # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
289
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
290
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
291
+
292
+ with gr.Column():
293
  cur_dir = os.path.dirname(os.path.abspath(__file__))
294
  gr.Examples(
295
  examples=[
296
  [
297
  f"{cur_dir}/examples/extreme_ironing.jpg",
298
+ "What is the phone recording?",
299
  ],
300
  [
301
  f"{cur_dir}/examples/waterview.jpg",
 
308
  ],
309
  inputs=[image, textbox],
310
  )
311
+ gr.Examples(
312
+ examples=[
313
+ [
314
+ f"{cur_dir}/examples/rap.mp4",
315
+ "What happens in this video?",
316
+ ],
317
+ [
318
+ f"{cur_dir}/examples/demo2.mp4",
319
+ "Do you think it's morning or night in this video? Why?",
320
+ ],
321
+ [
322
+ f"{cur_dir}/examples/demo3.mp4",
323
+ "At the intersection, in which direction does the red car turn?",
324
+ ],
325
+ ],
326
+ inputs=[video, textbox],
327
+ )
328
 
329
  gr.Markdown(tos_markdown)
330
  gr.Markdown(learn_more_markdown)
331
 
332
  submit_btn.click(
333
  generate,
334
+ [image, video, first_run, state, state_, textbox, temperature, top_p, max_output_tokens],
335
  [image, video, chatbot, first_run, state, state_, textbox])
336
 
337
  regenerate_btn.click(
 
339
  [state, state_, textbox],
340
  [state, state_, textbox, chatbot, first_run]).then(
341
  generate,
342
+ [image, video, first_run, state, state_, textbox, temperature, top_p, max_output_tokens],
343
  [image, video, chatbot, first_run, state, state_, textbox])
344
 
345
  clear_btn.click(
examples/{1034346401.mp4 → demo2.mp4} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08b62a634fe49edc0a19fc53f6ea5cfb345d9b2a6a7047811344c16832dc42b2
3
- size 1678095
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aec475bc837a1372f0b1c9ccea2c0c293f8d90f3f381f68f0691964d6d48fdca
3
+ size 3292167
examples/{sample_demo_1.mp4 → demo3.mp4} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc6562a172eb9cb3c760a3c9992349c1faa2c793c112b7b9e50bd5cb17c2164d
3
- size 1549315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8783c215899aea0324b74bc4254b105bc2aa1759080dca0eb8166b2405e8cd5
3
+ size 4527999
examples/{sample_demo_3.mp4 → rap.mp4} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da6126bce64c64a3d6f7ce889fbe15b5f1c2e3f978846351d8c7a79a950b429e
3
- size 463547
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde3c54b5e9aba7cf77c6530684a4bba45f661dd1ab664043375bf9582196200
3
+ size 13779546
examples/sample_demo_9.mp4 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9702694f185e27ae016b85024b367e140cf93a4e3124d072816fd32f2ca0d96
3
- size 631864