eggarsway committed on
Commit
14b0bd5
·
1 Parent(s): 26a8b55

progressbar

Browse files
TrailBlazer/Pipeline/TextToVideoSDPipelineCall.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import inspect
2
  from typing import Any, Callable, Dict, List, Optional, Union
3
 
@@ -62,6 +63,7 @@ def text_to_video_sd_pipeline_call(
62
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
63
  callback_steps: int = 1,
64
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
 
65
  ):
66
  r"""
67
  The call function to the pipeline for generation.
@@ -251,7 +253,13 @@ def text_to_video_sd_pipeline_call(
251
  latents_at_steps = []
252
 
253
  with self.progress_bar(total=num_inference_steps) as progress_bar:
254
- for i, t in enumerate(timesteps):
 
 
 
 
 
 
255
  # expand the latents if we are doing classifier free guidance
256
  latent_model_input = (
257
  torch.cat([latents] * 2) if do_classifier_free_guidance else latents
@@ -318,6 +326,7 @@ def text_to_video_sd_pipeline_call(
318
  progress_bar.update()
319
  if callback is not None and i % callback_steps == 0:
320
  callback(i, t, latents)
 
321
 
322
  if output_type == "latent":
323
  return TextToVideoSDPipelineOutput(frames=latents)
 
1
+ import tqdm
2
  import inspect
3
  from typing import Any, Callable, Dict, List, Optional, Union
4
 
 
63
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
64
  callback_steps: int = 1,
65
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
66
+ progress = None,
67
  ):
68
  r"""
69
  The call function to the pipeline for generation.
 
253
  latents_at_steps = []
254
 
255
  with self.progress_bar(total=num_inference_steps) as progress_bar:
256
+
257
+ if type(progress)!=type(None):
258
+ timesteps = progress.tqdm(timesteps, desc="Processing")
259
+
260
+ i = 0
261
+ for t in timesteps:
262
+
263
  # expand the latents if we are doing classifier free guidance
264
  latent_model_input = (
265
  torch.cat([latents] * 2) if do_classifier_free_guidance else latents
 
326
  progress_bar.update()
327
  if callback is not None and i % callback_steps == 0:
328
  callback(i, t, latents)
329
+ i += 1
330
 
331
  if output_type == "latent":
332
  return TextToVideoSDPipelineOutput(frames=latents)
app.py CHANGED
@@ -1,5 +1,6 @@
1
- import spaces
2
  import sys
 
3
  import os
4
  import torch
5
  import gradio as gr
@@ -15,6 +16,7 @@ static = os.path.join(root, "static")
15
  from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
16
  from diffusers.pipelines import TextToVideoSDPipeline
17
  from diffusers.utils import export_to_video
 
18
  from TrailBlazer.Misc import ConfigIO
19
  from TrailBlazer.Misc import Logger as log
20
  from TrailBlazer.Pipeline.TextToVideoSDPipelineCall import (
@@ -31,16 +33,14 @@ unet3d_condition_model_forward_copy = UNet3DConditionModel.forward
31
  UNet3DConditionModel.forward = unet3d_condition_model_forward
32
 
33
 
34
- from diffusers.utils import export_to_video
35
 
36
  model_id = "cerspense/zeroscope_v2_576w"
37
- model_path = model_id
38
  pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
39
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
40
- #pipe.enable_model_cpu_offload()
41
  pipe.to('cuda')
42
 
43
- @spaces.GPU(duration=100)
44
  def core(bundle):
45
  generator = torch.Generator().manual_seed(int(bundle["seed"]))
46
  result = pipe(
@@ -49,6 +49,7 @@ def core(bundle):
49
  width=512,
50
  generator=generator,
51
  num_inference_steps=40,
 
52
  )
53
  return result.frames
54
 
@@ -70,6 +71,7 @@ def gen_btn_fn(
70
  temporal_strengthen_scale,
71
  temporal_weaken_scale,
72
  rand_seed,
 
73
  ):
74
 
75
  bundle = {}
@@ -100,8 +102,6 @@ def gen_btn_fn(
100
  bboxes.pop()
101
  prompts.pop()
102
 
103
-
104
-
105
  for i in range(len(frames)):
106
  keyframe = {}
107
  keyframe["bbox_ratios"] = [float(v) for v in bboxes[i].split(",")]
@@ -208,10 +208,15 @@ with gr.Blocks(
208
  <li>Basic: The bounding box (bbox) is the tuple of four floats for the rectangular corners: left, top, right, bottom in the normalized ratio. The Word prompt indices is a list of 1-indexed numbers determining the prompt word.</li>
209
  <li>Advanced Options: We also offer some key parameters to adjust the synthesis result. Please see our paper for more information about the ablations.</li>
210
  </ul>
 
 
211
  </p>
212
  """
213
- gr.HTML(description)
214
 
 
 
 
 
215
  with gr.Row():
216
  with gr.Column(scale=2):
217
  with gr.Row():
@@ -227,14 +232,16 @@ with gr.Blocks(
227
  word_prompt_indices_tb = gr.Textbox(
228
  interactive=True, label="Word prompt indices:"
229
  )
230
- text = "Hint: Each keyframe ends with <strong>SEMICOLON</strong>, and <strong>COMMA</strong> for separating each value in the keyframe. The prompt field can be a single prompt without semicolon, or multiple prompts ended semicolon. One can use the SketchPadHelper tab to help to design the bboxes field."
231
  gr.HTML(text)
232
  with gr.Row():
233
  clear_btn = gr.Button(value="Clear")
234
  gen_btn = gr.Button(value="Generate")
235
 
236
  with gr.Accordion("Advanced Options", open=False):
237
- text = "Hint: This default value should be sufficient for most tasks. However, it's important to note that our approach is currently implemented on ZeroScope, and its performance may be influenced by the model's characteristics. We plan to conduct experiments on different models in the future."
 
 
238
  gr.HTML(text)
239
  with gr.Row():
240
  trailing_length = gr.Slider(
@@ -321,7 +328,7 @@ with gr.Blocks(
321
  out_board_cb, inputs=[user_board], outputs=[out_board]
322
  )
323
  with gr.Row():
324
- text = "Hint: Utilize a black pen with the Draw Button to create a ``rough'' bbox. When you press the green ``Save Changes'' Button, the app calculates the minimum and maximum boundaries. Each ``Layer'', located at the bottom left of the pad, corresponds to one bounding box. Copy the returned value to the bbox textfield in the main tab."
325
  gr.HTML(text)
326
  with gr.Row():
327
  out_label = gr.Label(label="Converted bboxes string")
@@ -331,7 +338,7 @@ with gr.Blocks(
331
 
332
  with gr.Column(scale=1):
333
  gr.HTML(
334
- '<span style="font-size: 20px; font-weight: bold">Generated Images</span>'
335
  )
336
  with gr.Row():
337
  out_gen_1 = gr.Video(visible=True, show_label=False)
@@ -340,11 +347,12 @@ with gr.Blocks(
340
  gr.Examples(
341
  examples=[
342
  [
343
- "A clown fish swimming in a coral reef",
344
  "0.5,0.35,1.0,0.65; 0.0,0.35,0.5,0.65;",
345
  "0; 24;",
346
- "1,2,3",
347
  "123451232531",
 
348
  "assets/gradio/fish-RL.mp4",
349
  ],
350
  [
@@ -352,8 +360,9 @@ with gr.Blocks(
352
  "0.0,0.35,0.4,0.65; 0.6,0.35,1.0,0.65; 0.0,0.35,0.4,0.65;"
353
  "0.6,0.35,1.0,0.65; 0.0,0.35,0.4,0.65;",
354
  "0; 6; 12; 18; 24;",
355
- "1,2",
356
  "123451232530",
 
357
  "assets/gradio/cat-LRLR.mp4",
358
  ],
359
  [
@@ -362,6 +371,7 @@ with gr.Blocks(
362
  "0; 24;",
363
  "1, 2",
364
  "0",
 
365
  "assets/gradio/fish-TL2BR.mp4"
366
  ],
367
  [
@@ -370,6 +380,7 @@ with gr.Blocks(
370
  "0; 24;",
371
  "1, 2",
372
  "0",
 
373
  "assets/gradio/tiger-TL2BR.mp4"
374
  ],
375
  [
@@ -378,10 +389,11 @@ with gr.Blocks(
378
  "0; 24;",
379
  "1,2,3",
380
  "123451232531",
 
381
  "assets/gradio/Cat2Dog.mp4",
382
  ],
383
  ],
384
- inputs=[text_prompt_tb, bboxes_tb, frame_tb, word_prompt_indices_tb, rand_seed,out_gen_1],
385
  outputs=None,
386
  fn=None,
387
  cache_examples=False,
@@ -411,8 +423,7 @@ with gr.Blocks(
411
  rand_seed,
412
  ],
413
  outputs=[out_gen_1],
414
- queue=False,
415
  )
416
 
417
- main.queue(max_size=40, api_open=False)
418
- main.launch(max_threads=400)
 
1
+ #import spaces
2
  import sys
3
+ import time
4
  import os
5
  import torch
6
  import gradio as gr
 
16
  from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
17
  from diffusers.pipelines import TextToVideoSDPipeline
18
  from diffusers.utils import export_to_video
19
+
20
  from TrailBlazer.Misc import ConfigIO
21
  from TrailBlazer.Misc import Logger as log
22
  from TrailBlazer.Pipeline.TextToVideoSDPipelineCall import (
 
33
  UNet3DConditionModel.forward = unet3d_condition_model_forward
34
 
35
 
 
36
 
37
  model_id = "cerspense/zeroscope_v2_576w"
38
+ model_path = sys.argv[-1] + model_id
39
  pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
40
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 
41
  pipe.to('cuda')
42
 
43
+ #@spaces.GPU(duration=120)
44
  def core(bundle):
45
  generator = torch.Generator().manual_seed(int(bundle["seed"]))
46
  result = pipe(
 
49
  width=512,
50
  generator=generator,
51
  num_inference_steps=40,
52
+ progress=gr.Progress(track_tqdm=True),
53
  )
54
  return result.frames
55
 
 
71
  temporal_strengthen_scale,
72
  temporal_weaken_scale,
73
  rand_seed,
74
+ progress = gr.Progress(),
75
  ):
76
 
77
  bundle = {}
 
102
  bboxes.pop()
103
  prompts.pop()
104
 
 
 
105
  for i in range(len(frames)):
106
  keyframe = {}
107
  keyframe["bbox_ratios"] = [float(v) for v in bboxes[i].split(",")]
 
208
  <li>Basic: The bounding box (bbox) is the tuple of four floats for the rectangular corners: left, top, right, bottom in the normalized ratio. The Word prompt indices is a list of 1-indexed numbers determining the prompt word.</li>
209
  <li>Advanced Options: We also offer some key parameters to adjust the synthesis result. Please see our paper for more information about the ablations.</li>
210
  </ul>
211
+
212
+ For your initial use, it is advisable to select one of the examples provided below and attempt to swap the subject first (e.g., cat -> lion). Subsequently, define the keyframe with the associated bbox/frame/prompt. Please note that our current work is based on the ZeroScope (cerspense/zeroscope_v2_576w) model. Using prompts that are commonly recognized in the ZeroScope model context is recommended.
213
  </p>
214
  """
 
215
 
216
+ gr.HTML(description)
217
+ dummy_note = gr.Textbox(
218
+ interactive=True, label="Note", visible=False
219
+ )
220
  with gr.Row():
221
  with gr.Column(scale=2):
222
  with gr.Row():
 
232
  word_prompt_indices_tb = gr.Textbox(
233
  interactive=True, label="Word prompt indices:"
234
  )
235
+ text = "<strong>Hint</strong>: Each keyframe ends with <strong>SEMICOLON</strong>, and <strong>COMMA</strong> for separating each value in the keyframe. The prompt field can be a single prompt without semicolon, or multiple prompts ended semicolon. One can use the SketchPadHelper tab to help to design the bboxes field."
236
  gr.HTML(text)
237
  with gr.Row():
238
  clear_btn = gr.Button(value="Clear")
239
  gen_btn = gr.Button(value="Generate")
240
 
241
  with gr.Accordion("Advanced Options", open=False):
242
+ text = "<strong>Hint</strong>: This default value should be sufficient for most tasks. However, it's important to note that our approach is currently implemented on ZeroScope, and its performance may be influenced by the model's characteristics. We plan to conduct experiments on different models in the future."
243
+ gr.HTML(text)
244
+ text = "<strong>Hint</strong>: When the #Spatial edits and #Temporal edits sliders are 0, it means the experiment will run without TrailBlazer but just simply a T2V generation through ZeroScope."
245
  gr.HTML(text)
246
  with gr.Row():
247
  trailing_length = gr.Slider(
 
328
  out_board_cb, inputs=[user_board], outputs=[out_board]
329
  )
330
  with gr.Row():
331
+ text = "<strong>Hint</strong>: Utilize a black pen with the Draw Button to create a ``rough'' bbox. When you press the green ``Save Changes'' Button, the app calculates the minimum and maximum boundaries. Each ``Layer'', located at the bottom left of the pad, corresponds to one bounding box. Copy the returned value to the bbox textfield in the main tab."
332
  gr.HTML(text)
333
  with gr.Row():
334
  out_label = gr.Label(label="Converted bboxes string")
 
338
 
339
  with gr.Column(scale=1):
340
  gr.HTML(
341
+ '<span style="font-size: 20px; font-weight: bold">Generated Video</span>'
342
  )
343
  with gr.Row():
344
  out_gen_1 = gr.Video(visible=True, show_label=False)
 
347
  gr.Examples(
348
  examples=[
349
  [
350
+ "A clownfish swimming in a coral reef",
351
  "0.5,0.35,1.0,0.65; 0.0,0.35,0.5,0.65;",
352
  "0; 24;",
353
+ "1, 2",
354
  "123451232531",
355
+ "It generates clownfish at right, then move to left",
356
  "assets/gradio/fish-RL.mp4",
357
  ],
358
  [
 
360
  "0.0,0.35,0.4,0.65; 0.6,0.35,1.0,0.65; 0.0,0.35,0.4,0.65;"
361
  "0.6,0.35,1.0,0.65; 0.0,0.35,0.4,0.65;",
362
  "0; 6; 12; 18; 24;",
363
+ "1, 2",
364
  "123451232530",
365
+ "The cat will run Left/Right/Left/Right",
366
  "assets/gradio/cat-LRLR.mp4",
367
  ],
368
  [
 
371
  "0; 24;",
372
  "1, 2",
373
  "0",
374
+ "The fish moves from top left to bottom right, from far to near.",
375
  "assets/gradio/fish-TL2BR.mp4"
376
  ],
377
  [
 
380
  "0; 24;",
381
  "1, 2",
382
  "0",
383
+ "Same with the above but now the prompt associates with tiger",
384
  "assets/gradio/tiger-TL2BR.mp4"
385
  ],
386
  [
 
389
  "0; 24;",
390
  "1,2,3",
391
  "123451232531",
392
+ "The subject will deformed from cat to dog.",
393
  "assets/gradio/Cat2Dog.mp4",
394
  ],
395
  ],
396
+ inputs=[text_prompt_tb, bboxes_tb, frame_tb, word_prompt_indices_tb, rand_seed, dummy_note, out_gen_1],
397
  outputs=None,
398
  fn=None,
399
  cache_examples=False,
 
423
  rand_seed,
424
  ],
425
  outputs=[out_gen_1],
 
426
  )
427
 
428
+ main.launch()
429
+ # main.launch(max_threads=400)