tsqn committed on
Commit
056829f
·
verified ·
1 Parent(s): 3f09c6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +210 -646
app.py CHANGED
@@ -1,677 +1,241 @@
1
- import gradio as gr
2
- from gradio_toggle import Toggle
 
 
 
3
  import torch
4
- from huggingface_hub import snapshot_download
 
 
 
 
5
 
6
- from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
7
- from xora.models.transformers.transformer3d import Transformer3DModel
8
- from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
9
- from xora.schedulers.rf import RectifiedFlowScheduler
10
- from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
11
  from transformers import T5EncoderModel, T5Tokenizer
12
- from xora.utils.conditioning_method import ConditioningMethod
13
  from pathlib import Path
14
- import safetensors.torch
15
- import json
16
- import numpy as np
17
- import cv2
18
- from PIL import Image
19
- import tempfile
20
- import os
21
- import gc
22
- from openai import OpenAI
23
-
24
- # Load Hugging Face token if needed
25
- hf_token = os.getenv("HF_TOKEN")
26
- openai_api_key = os.getenv("OPENAI_API_KEY")
27
- client = OpenAI(api_key=openai_api_key)
28
- system_prompt_t2v_path = "assets/system_prompt_t2v.txt"
29
- system_prompt_i2v_path = "assets/system_prompt_i2v.txt"
30
- with open(system_prompt_t2v_path, "r") as f:
31
- system_prompt_t2v = f.read()
32
-
33
- with open(system_prompt_i2v_path, "r") as f:
34
- system_prompt_i2v = f.read()
35
-
36
- # Set model download directory within Hugging Face Spaces
37
- model_path = "asset"
38
- if not os.path.exists(model_path):
39
- snapshot_download(
40
- "Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token
41
- )
42
-
43
- # Global variables to load components
44
- vae_dir = Path(model_path) / "vae"
45
- unet_dir = Path(model_path) / "unet"
46
- scheduler_dir = Path(model_path) / "scheduler"
47
-
48
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
-
50
-
51
- def load_vae(vae_dir):
52
- vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
53
- vae_config_path = vae_dir / "config.json"
54
- with open(vae_config_path, "r") as f:
55
- vae_config = json.load(f)
56
- vae = CausalVideoAutoencoder.from_config(vae_config)
57
- vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
58
- vae.load_state_dict(vae_state_dict)
59
- return vae.to(device=device, dtype=torch.bfloat16)
60
-
61
 
62
- def load_unet(unet_dir):
63
- unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
64
- unet_config_path = unet_dir / "config.json"
65
- transformer_config = Transformer3DModel.load_config(unet_config_path)
66
- transformer = Transformer3DModel.from_config(transformer_config)
67
- unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
68
- transformer.load_state_dict(unet_state_dict, strict=True)
69
- return transformer.to(device=device, dtype=torch.bfloat16)
70
 
 
 
 
 
 
71
 
72
- def load_scheduler(scheduler_dir):
73
- scheduler_config_path = scheduler_dir / "scheduler_config.json"
74
- scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
75
- return RectifiedFlowScheduler.from_config(scheduler_config)
76
 
 
77
 
78
- # Helper function for image processing
79
- def center_crop_and_resize(frame, target_height, target_width):
80
- h, w, _ = frame.shape
81
- aspect_ratio_target = target_width / target_height
82
- aspect_ratio_frame = w / h
83
- if aspect_ratio_frame > aspect_ratio_target:
84
- new_width = int(h * aspect_ratio_target)
85
- x_start = (w - new_width) // 2
86
- frame_cropped = frame[:, x_start : x_start + new_width]
87
- else:
88
- new_height = int(w / aspect_ratio_target)
89
- y_start = (h - new_height) // 2
90
- frame_cropped = frame[y_start : y_start + new_height, :]
91
- frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
92
- return frame_resized
93
-
94
-
95
- def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
96
- image = Image.open(image_path).convert("RGB")
97
- image_np = np.array(image)
98
- frame_resized = center_crop_and_resize(image_np, target_height, target_width)
99
- frame_tensor = torch.tensor(frame_resized).permute(2, 0, 1).float()
100
- frame_tensor = (frame_tensor / 127.5) - 1.0
101
- return frame_tensor.unsqueeze(0).unsqueeze(2)
102
-
103
-
104
- def enhance_prompt_if_enabled(prompt, enhance_toggle, type="t2v"):
105
- if not enhance_toggle:
106
- print("Enhance toggle is off, Prompt: ", prompt)
107
- return prompt
108
-
109
- system_prompt = system_prompt_t2v if type == "t2v" else system_prompt_i2v
110
- messages = [
111
- {"role": "system", "content": system_prompt},
112
- {"role": "user", "content": prompt},
113
- ]
114
-
115
- try:
116
- response = client.chat.completions.create(
117
- model="gpt-4o-mini",
118
- messages=messages,
119
- max_tokens=200,
120
- )
121
- print("Enhanced Prompt: ", response.choices[0].message.content.strip())
122
- return response.choices[0].message.content.strip()
123
- except Exception as e:
124
- print(f"Error: {e}")
125
- return prompt
126
-
127
-
128
- # Preset options for resolution and frame configuration
129
- preset_options = [
130
- {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
131
- {"label": "1088x704, 49 frames", "width": 1088, "height": 704, "num_frames": 49},
132
- {"label": "1056x640, 57 frames", "width": 1056, "height": 640, "num_frames": 57},
133
- {"label": "992x608, 65 frames", "width": 992, "height": 608, "num_frames": 65},
134
- {"label": "896x608, 73 frames", "width": 896, "height": 608, "num_frames": 73},
135
- {"label": "896x544, 81 frames", "width": 896, "height": 544, "num_frames": 81},
136
- {"label": "832x544, 89 frames", "width": 832, "height": 544, "num_frames": 89},
137
- {"label": "800x512, 97 frames", "width": 800, "height": 512, "num_frames": 97},
138
- {"label": "768x512, 97 frames", "width": 768, "height": 512, "num_frames": 97},
139
- {"label": "800x480, 105 frames", "width": 800, "height": 480, "num_frames": 105},
140
- {"label": "736x480, 113 frames", "width": 736, "height": 480, "num_frames": 113},
141
- {"label": "704x480, 121 frames", "width": 704, "height": 480, "num_frames": 121},
142
- {"label": "704x448, 129 frames", "width": 704, "height": 448, "num_frames": 129},
143
- {"label": "672x448, 137 frames", "width": 672, "height": 448, "num_frames": 137},
144
- {"label": "640x416, 153 frames", "width": 640, "height": 416, "num_frames": 153},
145
- {"label": "672x384, 161 frames", "width": 672, "height": 384, "num_frames": 161},
146
- {"label": "640x384, 169 frames", "width": 640, "height": 384, "num_frames": 169},
147
- {"label": "608x384, 177 frames", "width": 608, "height": 384, "num_frames": 177},
148
- {"label": "576x384, 185 frames", "width": 576, "height": 384, "num_frames": 185},
149
- {"label": "608x352, 193 frames", "width": 608, "height": 352, "num_frames": 193},
150
- {"label": "576x352, 201 frames", "width": 576, "height": 352, "num_frames": 201},
151
- {"label": "544x352, 209 frames", "width": 544, "height": 352, "num_frames": 209},
152
- {"label": "512x352, 225 frames", "width": 512, "height": 352, "num_frames": 225},
153
- {"label": "512x352, 233 frames", "width": 512, "height": 352, "num_frames": 233},
154
- {"label": "544x320, 241 frames", "width": 544, "height": 320, "num_frames": 241},
155
- {"label": "512x320, 249 frames", "width": 512, "height": 320, "num_frames": 249},
156
- {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
157
- ]
158
-
159
-
160
- # Function to toggle visibility of sliders based on preset selection
161
- def preset_changed(preset):
162
- if preset != "Custom":
163
- selected = next(item for item in preset_options if item["label"] == preset)
164
- return (
165
- selected["height"],
166
- selected["width"],
167
- selected["num_frames"],
168
- gr.update(visible=False),
169
- gr.update(visible=False),
170
- gr.update(visible=False),
171
- )
172
- else:
173
- return (
174
- None,
175
- None,
176
- None,
177
- gr.update(visible=True),
178
- gr.update(visible=True),
179
- gr.update(visible=True),
180
- )
181
-
182
-
183
- # Load models
184
- vae = load_vae(vae_dir)
185
- unet = load_unet(unet_dir)
186
- scheduler = load_scheduler(scheduler_dir)
187
- patchifier = SymmetricPatchifier(patch_size=1)
188
- text_encoder = T5EncoderModel.from_pretrained(
189
- "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
190
- ).to(device)
191
- tokenizer = T5Tokenizer.from_pretrained(
192
- "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
193
- )
194
-
195
- pipeline = XoraVideoPipeline(
196
- transformer=unet,
197
- patchifier=patchifier,
198
- text_encoder=text_encoder,
199
- tokenizer=tokenizer,
200
- scheduler=scheduler,
201
- vae=vae,
202
- ).to(device)
203
-
204
-
205
- def generate_video_from_text(
206
- prompt="",
207
- enhance_prompt_toggle=False,
208
- negative_prompt="",
209
- frame_rate=25,
210
- seed=171198,
211
- num_inference_steps=30,
212
- guidance_scale=3,
213
- height=512,
214
- width=768,
215
- num_frames=121,
216
- progress=gr.Progress(),
217
- ):
218
- if len(prompt.strip()) < 50:
219
- raise gr.Error(
220
- "Prompt must be at least 50 characters long. Please provide more details for the best results.",
221
- duration=5,
222
- )
223
-
224
- prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
225
-
226
- sample = {
227
  "prompt": prompt,
228
- "prompt_attention_mask": None,
229
  "negative_prompt": negative_prompt,
230
- "negative_prompt_attention_mask": None,
231
- "media_items": None,
 
 
 
 
232
  }
 
 
233
 
234
- generator = torch.Generator(device="cpu").manual_seed(seed)
 
 
235
 
236
- def gradio_progress_callback(self, step, timestep, kwargs):
237
- progress((step + 1) / num_inference_steps)
 
238
 
239
- try:
240
- with torch.no_grad():
241
- images = pipeline(
242
- num_inference_steps=num_inference_steps,
243
- num_images_per_prompt=1,
244
- guidance_scale=guidance_scale,
245
- generator=generator,
246
- output_type="pt",
247
- height=height,
248
- width=width,
249
- num_frames=num_frames,
250
- frame_rate=frame_rate,
251
- **sample,
252
- is_video=True,
253
- vae_per_channel_normalize=True,
254
- conditioning_method=ConditioningMethod.UNCONDITIONAL,
255
- mixed_precision=True,
256
- callback_on_step_end=gradio_progress_callback,
257
- ).images
258
- except Exception as e:
259
- raise gr.Error(
260
- f"An error occurred while generating the video. Please try again. Error: {e}",
261
- duration=5,
262
- )
263
- finally:
264
- torch.cuda.empty_cache()
265
- gc.collect()
266
 
267
- output_path = tempfile.mktemp(suffix=".mp4")
268
- print(images.shape)
269
- video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
270
- video_np = (video_np * 255).astype(np.uint8)
271
- height, width = video_np.shape[1:3]
272
- out = cv2.VideoWriter(
273
- output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
274
- )
275
- for frame in video_np[..., ::-1]:
276
- out.write(frame)
277
- out.release()
278
- # Explicitly delete tensors and clear cache
279
- del images
280
- del video_np
281
- torch.cuda.empty_cache()
282
- return output_path
283
-
284
-
285
- def generate_video_from_image(
286
- image_path,
287
- prompt="",
288
- enhance_prompt_toggle=False,
289
- negative_prompt="",
290
- frame_rate=25,
291
- seed=171198,
292
- num_inference_steps=30,
293
- guidance_scale=3,
294
- height=512,
295
- width=768,
296
- num_frames=121,
297
- progress=gr.Progress(),
298
- ):
299
-
300
- print("Height: ", height)
301
- print("Width: ", width)
302
- print("Num Frames: ", num_frames)
303
-
304
- if len(prompt.strip()) < 50:
305
- raise gr.Error(
306
- "Prompt must be at least 50 characters long. Please provide more details for the best results.",
307
- duration=5,
308
- )
309
 
310
- if not image_path:
311
- raise gr.Error("Please provide an input image.", duration=5)
312
-
313
- media_items = (
314
- load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
315
- )
316
-
317
- prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
318
-
319
- sample = {
320
  "prompt": prompt,
321
- "prompt_attention_mask": None,
322
  "negative_prompt": negative_prompt,
323
- "negative_prompt_attention_mask": None,
324
- "media_items": media_items,
 
 
 
 
325
  }
326
-
327
- generator = torch.Generator(device="cpu").manual_seed(seed)
328
-
329
- def gradio_progress_callback(self, step, timestep, kwargs):
330
- progress((step + 1) / num_inference_steps)
331
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  try:
333
- with torch.no_grad():
334
- images = pipeline(
335
- num_inference_steps=num_inference_steps,
336
- num_images_per_prompt=1,
337
- guidance_scale=guidance_scale,
338
- generator=generator,
339
- output_type="pt",
340
- height=height,
341
- width=width,
342
- num_frames=num_frames,
343
- frame_rate=frame_rate,
344
- **sample,
345
- is_video=True,
346
- vae_per_channel_normalize=True,
347
- conditioning_method=ConditioningMethod.FIRST_FRAME,
348
- mixed_precision=True,
349
- callback_on_step_end=gradio_progress_callback,
350
- ).images
351
-
352
- output_path = tempfile.mktemp(suffix=".mp4")
353
- video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
354
- video_np = (video_np * 255).astype(np.uint8)
355
- height, width = video_np.shape[1:3]
356
- out = cv2.VideoWriter(
357
- output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
358
- )
359
- for frame in video_np[..., ::-1]:
360
- out.write(frame)
361
- out.release()
362
  except Exception as e:
363
- raise gr.Error(
364
- f"An error occurred while generating the video. Please try again. Error: {e}",
365
- duration=5,
366
- )
367
-
368
- finally:
369
- torch.cuda.empty_cache()
370
- gc.collect()
 
 
 
 
 
 
 
 
371
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  return output_path
373
 
 
 
374
 
375
- def create_advanced_options():
376
- with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
377
- seed = gr.Slider(
378
- label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=171198
379
- )
380
- inference_steps = gr.Slider(
381
- label="4.2 Inference Steps", minimum=1, maximum=50, step=1, value=30
382
- )
383
- guidance_scale = gr.Slider(
384
- label="4.3 Guidance Scale", minimum=1.0, maximum=5.0, step=0.1, value=3.0
385
- )
386
-
387
- height_slider = gr.Slider(
388
- label="4.4 Height",
389
- minimum=256,
390
- maximum=1024,
391
- step=64,
392
- value=512,
393
- visible=False,
394
- )
395
- width_slider = gr.Slider(
396
- label="4.5 Width",
397
- minimum=256,
398
- maximum=1024,
399
- step=64,
400
- value=768,
401
- visible=False,
402
- )
403
- num_frames_slider = gr.Slider(
404
- label="4.5 Number of Frames",
405
- minimum=1,
406
- maximum=200,
407
- step=1,
408
- value=121,
409
- visible=False,
410
- )
411
-
412
- return [
413
- seed,
414
- inference_steps,
415
- guidance_scale,
416
- height_slider,
417
- width_slider,
418
- num_frames_slider,
419
- ]
420
-
421
-
422
- # Define the Gradio interface with tabs
423
- with gr.Blocks(theme=gr.themes.Soft()) as iface:
424
- with gr.Row(elem_id="title-row"):
425
- gr.Markdown(
426
- """
427
- <div style="text-align: center; margin-bottom: 1em">
428
- <h1 style="font-size: 2.5em; font-weight: 600; margin: 0.5em 0;">Video Generation with LTX Video</h1>
429
- </div>
430
- """
431
- )
432
- with gr.Row(elem_id="title-row"):
433
- gr.HTML( # add technical report link
434
- """
435
- <div style="display:flex;column-gap:4px;">
436
- <a href="https://github.com/Lightricks/LTX-Video">
437
- <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
438
- </a>
439
- <a href="https://github.com/Lightricks/ComfyUI-LTXVideo">
440
- <img src='https://img.shields.io/badge/GitHub-ComfyUI-blue'>
441
- </a>
442
- <a href="http://www.lightricks.com/ltxv">
443
- <img src="https://img.shields.io/badge/Project-Page-green" alt="Follow me on HF">
444
- </a>
445
- <a href="https://huggingface.co/spaces/Lightricks/LTX-Video-Playground?duplicate=true">
446
- <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
447
- </a>
448
- <a href="https://huggingface.co/Lightricks">
449
- <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/follow-me-on-HF-sm-dark.svg" alt="Follow me on HF">
450
- </a>
451
- </div>
452
- """
453
- )
454
- with gr.Accordion(
455
- " 📖 Tips for Best Results", open=False, elem_id="instructions-accordion"
456
- ):
457
- gr.Markdown(
458
- """
459
- 📝 Prompt Engineering
460
-
461
- When writing prompts, focus on detailed, chronological descriptions of actions and scenes. Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph. Start directly with the action, and keep descriptions literal and precise. Think like a cinematographer describing a shot list. Keep within 200 words.
462
- For best results, build your prompts using this structure:
463
-
464
- - Start with main action in a single sentence
465
- - Add specific details about movements and gestures
466
- - Describe character/object appearances precisely
467
- - Include background and environment details
468
- - Specify camera angles and movements
469
- - Describe lighting and colors
470
- - Note any changes or sudden events
471
-
472
- See examples for more inspiration.
473
-
474
- 🎮 Parameter Guide
475
-
476
- - Resolution Preset: Higher resolutions for detailed scenes, lower for faster generation and simpler scenes
477
- - Seed: Save seed values to recreate specific styles or compositions you like
478
- - Guidance Scale: 3-3.5 are the recommended values
479
- - Inference Steps: More steps (40+) for quality, fewer steps (20-30) for speed
480
- """
481
- )
482
-
483
  with gr.Tabs():
484
- # Text to Video Tab
485
- with gr.TabItem("Text to Video"):
486
  with gr.Row():
487
- with gr.Column():
488
- txt2vid_prompt = gr.Textbox(
489
- label="Step 1: Enter Your Prompt",
490
- placeholder="Describe the video you want to generate (minimum 50 characters)...",
491
- value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
492
- lines=5,
493
- )
494
- txt2vid_enhance_toggle = Toggle(
495
- label="Enhance Prompt",
496
- value=False,
497
- interactive=True,
498
- )
499
-
500
- txt2vid_negative_prompt = gr.Textbox(
501
- label="Step 2: Enter Negative Prompt",
502
- placeholder="Describe what you don't want in the video...",
503
- value="low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
504
- lines=2,
505
- )
506
-
507
- txt2vid_preset = gr.Dropdown(
508
- choices=[p["label"] for p in preset_options],
509
- value="768x512, 97 frames",
510
- label="Step 3.1: Choose Resolution Preset",
511
- )
512
-
513
- txt2vid_frame_rate = gr.Slider(
514
- label="Step 3.2: Frame Rate",
515
- minimum=21,
516
- maximum=30,
517
- step=1,
518
- value=25,
519
- )
520
-
521
- txt2vid_advanced = create_advanced_options()
522
- txt2vid_generate = gr.Button(
523
- "Step 5: Generate Video",
524
- variant="primary",
525
- size="lg",
526
- )
527
-
528
- with gr.Column():
529
- txt2vid_output = gr.Video(label="Generated Output")
530
-
531
  with gr.Row():
532
- gr.Examples(
533
- examples=[
534
- [
535
- "A young woman in a traditional Mongolian dress is peeking through a sheer white curtain, her face showing a mix of curiosity and apprehension. The woman has long black hair styled in two braids, adorned with white beads, and her eyes are wide with a hint of surprise. Her dress is a vibrant blue with intricate gold embroidery, and she wears a matching headband with a similar design. The background is a simple white curtain, which creates a sense of mystery and intrigue.ith long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair’s face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage",
536
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
537
- "assets/t2v_2.mp4",
538
- ],
539
- [
540
- "A young man with blond hair wearing a yellow jacket stands in a forest and looks around. He has light skin and his hair is styled with a middle part. He looks to the left and then to the right, his gaze lingering in each direction. The camera angle is low, looking up at the man, and remains stationary throughout the video. The background is slightly out of focus, with green trees and the sun shining brightly behind the man. The lighting is natural and warm, with the sun creating a lens flare that moves across the man’s face. The scene is captured in real-life footage.",
541
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
542
- "assets/t2v_1.mp4",
543
- ],
544
- [
545
- "A cyclist races along a winding mountain road. Clad in aerodynamic gear, he pedals intensely, sweat glistening on his brow. The camera alternates between close-ups of his determined expression and wide shots of the breathtaking landscape. Pine trees blur past, and the sky is a crisp blue. The scene is invigorating and competitive.",
546
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
547
- "assets/t2v_0.mp4",
548
- ],
549
- ],
550
- inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output],
551
- label="Example Text-to-Video Generations",
552
- )
553
-
554
- # Image to Video Tab
555
- with gr.TabItem("Image to Video"):
556
  with gr.Row():
557
- with gr.Column():
558
- img2vid_image = gr.Image(
559
- type="filepath",
560
- label="Step 1: Upload Input Image",
561
- elem_id="image_upload",
562
- )
563
- img2vid_prompt = gr.Textbox(
564
- label="Step 2: Enter Your Prompt",
565
- placeholder="Describe how you want to animate the image (minimum 50 characters)...",
566
- value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
567
- lines=5,
568
- )
569
- img2vid_enhance_toggle = Toggle(
570
- label="Enhance Prompt",
571
- value=False,
572
- interactive=True,
573
- )
574
- img2vid_negative_prompt = gr.Textbox(
575
- label="Step 3: Enter Negative Prompt",
576
- placeholder="Describe what you don't want in the video...",
577
- value="low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
578
- lines=2,
579
- )
580
-
581
- img2vid_preset = gr.Dropdown(
582
- choices=[p["label"] for p in preset_options],
583
- value="768x512, 97 frames",
584
- label="Step 3.1: Choose Resolution Preset",
585
- )
586
-
587
- img2vid_frame_rate = gr.Slider(
588
- label="Step 3.2: Frame Rate",
589
- minimum=21,
590
- maximum=30,
591
- step=1,
592
- value=25,
593
- )
594
-
595
- img2vid_advanced = create_advanced_options()
596
- img2vid_generate = gr.Button(
597
- "Step 6: Generate Video", variant="primary", size="lg"
598
- )
599
-
600
- with gr.Column():
601
- img2vid_output = gr.Video(label="Generated Output")
602
-
603
  with gr.Row():
604
- gr.Examples(
605
- examples=[
606
- [
607
- "assets/i2v_i2.png",
608
- "A woman stirs a pot of boiling water on a white electric burner. Her hands, with purple nail polish, hold a wooden spoon and move it in a circular motion within a white pot filled with bubbling water. The pot sits on a white electric burner with black buttons and a digital display. The burner is positioned on a white countertop with a red and white checkered cloth partially visible in the bottom right corner. The camera angle is a direct overhead shot, remaining stationary throughout the scene. The lighting is bright and even, illuminating the scene with a neutral white light. The scene is real-life footage.",
609
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
610
- "assets/i2v_2.mp4",
611
- ],
612
- [
613
- "assets/i2v_i0.png",
614
- "A woman in a long, flowing dress stands in a field, her back to the camera, gazing towards the horizon; her hair is long and light, cascading down her back; she stands beneath the sprawling branches of a large oak tree; to her left, a classic American car is parked on the dry grass; in the distance, a wrecked car lies on its side; the sky above is a dramatic canvas of bright white clouds against a darker sky; the entire image is in black and white, emphasizing the contrast of light and shadow. The woman is walking slowly towards the car.",
615
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
616
- "assets/i2v_0.mp4",
617
- ],
618
- [
619
- "assets/i2v_i1.png",
620
- "A pair of hands shapes a piece of clay on a pottery wheel, gradually forming a cone shape. The hands, belonging to a person out of frame, are covered in clay and gently press a ball of clay onto the center of a spinning pottery wheel. The hands move in a circular motion, gradually forming a cone shape at the top of the clay. The camera is positioned directly above the pottery wheel, providing a bird’s-eye view of the clay being shaped. The lighting is bright and even, illuminating the clay and the hands working on it. The scene is captured in real-life footage.",
621
- "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
622
- "assets/i2v_1.mp4",
623
- ],
624
- ],
625
- inputs=[
626
- img2vid_image,
627
- img2vid_prompt,
628
- img2vid_negative_prompt,
629
- img2vid_output,
630
- ],
631
- label="Example Image-to-Video Generations",
632
- )
633
-
634
- # [Previous event handlers remain the same]
635
- txt2vid_preset.change(
636
- fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
637
- )
638
-
639
- txt2vid_generate.click(
640
- fn=generate_video_from_text,
641
- inputs=[
642
- txt2vid_prompt,
643
- txt2vid_enhance_toggle,
644
- txt2vid_negative_prompt,
645
- txt2vid_frame_rate,
646
- *txt2vid_advanced,
647
- ],
648
- outputs=txt2vid_output,
649
- concurrency_limit=1,
650
- concurrency_id="generate_video",
651
- queue=True,
652
- )
653
-
654
- img2vid_preset.change(
655
- fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:]
656
- )
657
-
658
- img2vid_generate.click(
659
- fn=generate_video_from_image,
660
- inputs=[
661
- img2vid_image,
662
- img2vid_prompt,
663
- img2vid_enhance_toggle,
664
- img2vid_negative_prompt,
665
- img2vid_frame_rate,
666
- *img2vid_advanced,
667
- ],
668
- outputs=img2vid_output,
669
- concurrency_limit=1,
670
- concurrency_id="generate_video",
671
- queue=True,
672
- )
673
-
674
- if __name__ == "__main__":
675
- iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
676
- share=True, show_api=False
677
- )
 
1
+ """
2
+ Copyright NewGenAI
3
+ Code can't be included in commercial app used for monetary gain. No derivative code allowed.
4
+ """
5
+ import json
6
  import torch
7
+ import gradio as gr
8
+ import random
9
+ import time
10
+ from datetime import datetime
11
+ import os
12
 
13
+ from diffusers.utils import export_to_video
14
+ from diffusers import LTXImageToVideoPipeline
 
 
 
15
  from transformers import T5EncoderModel, T5Tokenizer
 
16
  from pathlib import Path
17
+ from datetime import datetime
18
+ from huggingface_hub import hf_hub_download
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
# JSON file in the working directory where the UI settings are persisted.
STATE_FILE = "LTX091_I2V_state.json"

# Pending generation tasks; each entry is a dict of generate_video keyword
# arguments appended by add_to_queue.
queue = []
 
 
 
 
 
 
22
 
23
def load_state(path=None):
    """Load the persisted UI state from a JSON file.

    Args:
        path: Optional path of the state file. When omitted, the module-level
            ``STATE_FILE`` is used, which is the original behaviour.

    Returns:
        The stored state as a dict. An empty dict is returned when the file
        does not exist, cannot be read or parsed, or does not hold a JSON
        object, so a damaged state file never prevents the app from starting.
    """
    state_path = STATE_FILE if path is None else path
    try:
        with open(state_path, "r", encoding="utf-8") as file:
            state = json.load(file)
    except FileNotFoundError:
        # First run: nothing has been saved yet.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Ignoring unreadable state file {state_path}: {exc}")
        return {}
    # Callers treat the state as a mapping; discard any other JSON value.
    return state if isinstance(state, dict) else {}
28
 
29
def save_state(state, path=None):
    """Persist ``state`` as JSON without corrupting the previous file.

    The data is first written to a sibling temporary file and then moved over
    the destination with ``os.replace``. If serialization fails part-way, the
    previously saved state is left untouched instead of being truncated.

    Args:
        state: JSON-serializable object to store.
        path: Optional destination path. When omitted, the module-level
            ``STATE_FILE`` is used, which is the original behaviour.

    Raises:
        TypeError: If ``state`` is not JSON-serializable.
        OSError: If the file cannot be written.
    """
    state_path = STATE_FILE if path is None else path
    tmp_path = f"{state_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as file:
            json.dump(state, file)
        os.replace(tmp_path, state_path)
    except Exception:
        # Do not leave a half-written temporary file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
 
32
 
33
# Settings restored from the previous session; load_state() yields an empty
# dict when no state file exists yet.
initial_state = load_state()
34
 
35
def add_to_queue(image, prompt, negative_prompt, height, width, num_frames, num_inference_steps, fps, seed):
    """Append one generation request to the module-level ``queue``.

    The request is stored as a dict whose keys match the keyword arguments of
    ``generate_video``, so it can later be replayed with ``**task``.

    Returns:
        A status message that includes the queue length after the insertion.
    """
    queue.append(
        dict(
            image=image,
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            fps=fps,
            seed=seed,
        )
    )
    return f"Task added to queue. Current queue length: {len(queue)}"
49
 
50
def clear_queue():
    """Drop every pending task from the module-level ``queue``.

    The list is emptied in place, so other references to it see the change.

    Returns:
        A fixed confirmation message.
    """
    del queue[:]
    return "Queue cleared."
53
 
54
def process_queue():
    """Run every queued task through ``generate_video`` in FIFO order.

    Each task is removed from ``queue`` as soon as it has completed. If a
    task raises, the exception propagates and the queue still holds that task
    plus everything after it. Finished tasks are therefore never regenerated
    on the next call (previously the whole queue was kept on failure).

    Returns:
        A status message: either that the queue was empty or that all tasks
        have been processed.
    """
    if not queue:
        return "Queue is empty."

    while queue:
        task = queue[0]
        generate_video(**task)
        # Only drop the entry we just ran; the queue may have been cleared or
        # modified from the UI while the generation was in progress.
        if queue and queue[0] is task:
            queue.pop(0)
        time.sleep(1)

    return "All tasks in the queue have been processed."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
def save_ui_state(prompt, negative_prompt, height, width, num_frames, num_inference_steps, fps, seed):
    """Persist the current UI field values through ``save_state``.

    The values are stored under keys named after the parameters. The input
    image is not part of the saved state.

    Returns:
        A fixed confirmation message.
    """
    save_state(
        dict(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            num_inference_steps=num_inference_steps,
            fps=fps,
            seed=seed,
        )
    )
    return "State saved!"
78
+
79
# Download the diffusers-format LTX-Video 0.9.1 components into a local
# directory whose relative path equals the repo id. The loading code further
# down passes that same string to from_pretrained()/from_single_file().
repo_id = "a-r-r-o-w/LTX-Video-0.9.1-diffusers"
base_path = repo_id
files_to_download = [
    "model_index.json",
    "scheduler/scheduler_config.json",
    "text_encoder/config.json",
    "text_encoder/model-00001-of-00004.safetensors",
    "text_encoder/model-00002-of-00004.safetensors",
    "text_encoder/model-00003-of-00004.safetensors",
    "text_encoder/model-00004-of-00004.safetensors",
    "text_encoder/model.safetensors.index.json",
    "tokenizer/added_tokens.json",
    "tokenizer/special_tokens_map.json",
    "tokenizer/spiece.model",
    "tokenizer/tokenizer_config.json",
    "transformer/config.json",
    "transformer/diffusion_pytorch_model.safetensors",
    "vae/config.json",
    "vae/diffusion_pytorch_model.safetensors",
]
os.makedirs(base_path, exist_ok=True)
for file_path in files_to_download:
    try:
        # Make sure the per-component sub-directory exists before downloading.
        os.makedirs(
            os.path.join(base_path, os.path.dirname(file_path)), exist_ok=True
        )
        hf_hub_download(
            repo_id=repo_id,
            filename=file_path,
            local_dir=base_path,
        )
        print(f"Successfully downloaded: {file_path}")
    except Exception as e:
        # Report which file failed, then abort start-up: the app cannot run
        # with a partially downloaded model.
        print(f"Error downloading {file_path}: {str(e)}")
        raise

# The single-file 0.9.1 checkpoint lives in the upstream Lightricks repo; it is
# stored next to the component files. base_path already exists at this point,
# so no directory creation (and no dependence on the loop variable above) is
# needed here.
try:
    hf_hub_download(
        repo_id="Lightricks/LTX-Video",
        filename="ltx-video-2b-v0.9.1.safetensors",
        local_dir=repo_id,
    )
    print("Successfully downloaded: ltx-video-2b-v0.9.1.safetensors")
except Exception as e:
    print(f"Error downloading 0.9.1 model: {str(e)}")
    raise
131
 
132
# Build the image-to-video pipeline: transformer/VAE weights come from the
# single-file 0.9.1 checkpoint, while the T5 text encoder and tokenizer are
# loaded from their sub-folders under repo_id.
single_file_url = repo_id + "/ltx-video-2b-v0.9.1.safetensors"
text_encoder = T5EncoderModel.from_pretrained(
    repo_id, subfolder="text_encoder", torch_dtype=torch.bfloat16
)
# A tokenizer holds no tensors, so it takes no torch_dtype.
tokenizer = T5Tokenizer.from_pretrained(repo_id, subfolder="tokenizer")
pipe = LTXImageToVideoPipeline.from_single_file(
    single_file_url,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()
146
+
147
def _safe_filename_stem(prompt, max_chars=10):
    """Return the first *max_chars* characters of *prompt*, made file-name safe.

    Letters, digits, spaces, '-' and '_' are kept; every other character
    (path separators, quotes, newlines, ...) is replaced by '_'.
    """
    head = (prompt or "")[:max_chars]
    return "".join(ch if ch.isalnum() or ch in " -_" else "_" for ch in head)


def generate_video(image, prompt, negative_prompt, height, width, num_frames, num_inference_steps, fps, seed):
    """Run the image-to-video pipeline once and write the result as an MP4.

    Args:
        image: Conditioning image passed to the pipeline.
        prompt: Text prompt; its first 10 characters also seed the file name.
        negative_prompt: Text describing what to avoid.
        height, width: Output resolution in pixels.
        num_frames: Number of frames to generate.
        num_inference_steps: Number of denoising steps.
        fps: Frame rate used when exporting the video.
        seed: RNG seed. 0 (or an empty value) selects a random seed.

    Returns:
        Path of the written video file inside ./output_LTX091_i2v/.
    """
    # A cleared numeric field yields None, and 0 is the "pick one for me" value.
    if not seed:
        seed = random.randint(0, 999999)
    # UI number fields may deliver floats; manual_seed() only accepts ints.
    seed = int(seed)

    video = pipe(
        image=image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=torch.Generator(device='cuda').manual_seed(seed),
    ).frames[0]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Sanitize the prompt fragment so characters such as '/' cannot break the path.
    filename = f"{_safe_filename_stem(prompt)}_{timestamp}.mp4"

    os.makedirs("output_LTX091_i2v", exist_ok=True)
    output_path = f"./output_LTX091_i2v/{filename}"
    export_to_video(video, output_path, fps=fps)

    return output_path
170
 
171
def randomize_seed():
    """Return a fresh pseudo-random seed in the inclusive range 0..999999."""
    upper_exclusive = 1_000_000
    return random.randrange(0, upper_exclusive)
173
 
174
# Gradio UI: one tab for single generations (with persisted settings) and one
# tab for queueing several generations and running them in sequence.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Generate Video"):
            with gr.Row():
                input_image = gr.Image(label="Input Image", type="pil")
            with gr.Row():
                prompt = gr.Textbox(label="Prompt", lines=3, value=initial_state.get("prompt", "A dramatic view of the pyramids at Giza during sunset."))
                negative_prompt = gr.Textbox(label="Negative Prompt", lines=3, value=initial_state.get("negative_prompt", "worst quality, blurry, distorted"))
            # NOTE(review): LTX pipelines reportedly require height/width to be
            # divisible by 32 - confirm and consider step=32 on these sliders.
            with gr.Row():
                height = gr.Slider(label="Height", minimum=240, maximum=1080, step=1, value=initial_state.get("height", 480))
                width = gr.Slider(label="Width", minimum=320, maximum=1920, step=1, value=initial_state.get("width", 704))
            with gr.Row():
                num_frames = gr.Slider(label="Number of Frames", minimum=1, maximum=500, step=1, value=initial_state.get("num_frames", 161))
                num_inference_steps = gr.Slider(label="Number of Inference Steps", minimum=1, maximum=100, step=1, value=initial_state.get("num_inference_steps", 50))
            with gr.Row():
                fps = gr.Slider(label="FPS", minimum=1, maximum=60, step=1, value=initial_state.get("fps", 24))
                # precision=0 makes the field deliver an int; the seed is fed
                # to torch's manual_seed(), which rejects floats.
                seed = gr.Number(label="Seed", value=initial_state.get("seed", 0), precision=0)
                random_seed_button = gr.Button("Randomize Seed")

            output_video = gr.Video(label="Generated Video", show_label=True)
            generate_button = gr.Button("Generate Video")
            save_state_button = gr.Button("Save State")
            state_status = gr.Text(label="State Status")

            random_seed_button.click(randomize_seed, outputs=seed)
            generate_button.click(
                generate_video,
                inputs=[input_image, prompt, negative_prompt, height, width, num_frames, num_inference_steps, fps, seed],
                outputs=output_video,
            )
            save_state_button.click(
                save_ui_state,
                inputs=[prompt, negative_prompt, height, width, num_frames, num_inference_steps, fps, seed],
                outputs=state_status,
            )

        with gr.Tab("Batch Processing"):
            with gr.Row():
                batch_input_image = gr.Image(label="Input Image", type="pil")
            with gr.Row():
                batch_prompt = gr.Textbox(label="Prompt", lines=3, value="A batch of videos depicting different landscapes.")
                batch_negative_prompt = gr.Textbox(label="Negative Prompt", lines=3, value="low quality, inconsistent, jittery")
            with gr.Row():
                batch_height = gr.Slider(label="Height", minimum=240, maximum=1080, step=1, value=480)
                batch_width = gr.Slider(label="Width", minimum=320, maximum=1920, step=1, value=704)
            with gr.Row():
                batch_num_frames = gr.Slider(label="Number of Frames", minimum=1, maximum=500, step=1, value=161)
                batch_num_inference_steps = gr.Slider(label="Number of Inference Steps", minimum=1, maximum=100, step=1, value=50)
            with gr.Row():
                batch_fps = gr.Slider(label="FPS", minimum=1, maximum=60, step=1, value=24)
                # Integer seed for the same reason as on the first tab.
                batch_seed = gr.Number(label="Seed", value=0, precision=0)
                random_seed_batch_button = gr.Button("Randomize Seed")

            add_to_queue_button = gr.Button("Add to Queue")
            clear_queue_button = gr.Button("Clear Queue")
            process_queue_button = gr.Button("Process Queue")

            queue_status = gr.Text(label="Queue Status")

            random_seed_batch_button.click(randomize_seed, outputs=batch_seed)
            add_to_queue_button.click(
                add_to_queue,
                inputs=[batch_input_image, batch_prompt, batch_negative_prompt, batch_height, batch_width, batch_num_frames, batch_num_inference_steps, batch_fps, batch_seed],
                outputs=queue_status,
            )
            clear_queue_button.click(clear_queue, outputs=queue_status)
            process_queue_button.click(process_queue, outputs=queue_status)

demo.launch()