Drexubery committed on
Commit
33fe5f3
·
1 Parent(s): ae271f8
Files changed (6) hide show
  1. __pycache__/viewcrafter.cpython-39.pyc +0 -0
  2. app.py +127 -62
  3. app_2step.py +227 -0
  4. app_new1.py +230 -0
  5. app_new2.py +238 -0
  6. viewcrafter.py +37 -7
__pycache__/viewcrafter.cpython-39.pyc CHANGED
Binary files a/__pycache__/viewcrafter.cpython-39.pyc and b/__pycache__/viewcrafter.cpython-39.pyc differ
 
app.py CHANGED
@@ -10,13 +10,20 @@ from configs.infer_config import get_parser
10
  from huggingface_hub import hf_hub_download
11
 
12
  traj_examples = [
13
- ['0 40', '0 0', '0 0'],
14
- ['0 -35', '0 0', '0 -0.1'],
15
- ['0 -3 -15 -20 -17 -5 0', '0 -2 -5 -10 -8 -5 0 2 5 3 0', '0 0'],
16
- ['0 3 10 20 17 10 0', '0 -2 -8 -6 0 2 5 3 0', '0 -0.02 -0.09 -0.16 -0.09 0'],
17
- ['0 30', '0 -1 -5 -4 0 1 5 4 0', '0 -0.2'],
18
  ]
19
 
 
 
 
 
 
 
 
 
20
  img_examples = [
21
  ['test/images/boy.png',0,1],
22
  ['test/images/car.jpeg',5,1],
@@ -62,11 +69,31 @@ print(f'>>> System info: {version_str}')
62
  from viewcrafter import ViewCrafter
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def viewcrafter_demo(opts):
66
  css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px} #random_button {max-width: 100px !important}"""
67
  image2video = ViewCrafter(opts, gradio = True)
68
- image2video.run_traj = spaces.GPU(image2video.run_traj, duration=50) # fixme
69
- image2video.run_gen = spaces.GPU(image2video.run_gen, duration=260) # fixme
70
  with gr.Blocks(analytics_enabled=False, css=css) as viewcrafter_iface:
71
  gr.Markdown("<div align='center'> <h1> ViewCrafter: Taming Video Diffusion Models for High-fidelity Novel View Synthesis </span> </h1> \
72
  <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
@@ -84,71 +111,109 @@ def viewcrafter_demo(opts):
84
  <a style='font-size:18px;color: #000000' href='https://www.youtube.com/watch?v=WGIEmu9eXmU'> [Video] </a> </div>")
85
 
86
 
87
- with gr.Column():
88
- # step 0: tutorial
89
- gr.Markdown("## Step 0: Read tutorial", show_label=False)
90
- gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>Please refer to the tutorial <a href='https://github.com/Drexubery/ViewCrafter/blob/main/docs/gradio_tutorial.md' target='_blank'>here</a> for best practice, which includes the cameara system defination and the renderer parameters.</div>")
91
-
92
- # step 2: input an image
93
- gr.Markdown("---\n## Step 1: Input an Image, selet an elevation angle and a center_scale factor", show_label=False, visible=True)
94
- gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>1. Estimate an elevation angle that represents the angle at which the image was taken; a value bigger than 0 indicates a top-down view, and it doesn't need to be precise. <br>2. The origin of the world coordinate system is by default defined at the point cloud corresponding to the center pixel of the input image. You can adjust the position of the origin by modifying center_scale; a value smaller than 1 brings the origin closer to you.</div>")
95
- with gr.Row(equal_height=True):
96
- with gr.Column(scale=2):
97
- with gr.Row():
98
- i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
99
- with gr.Row():
100
- i2v_elevation = gr.Slider(minimum=-45, maximum=45, step=1, elem_id="elevation", label="elevation", value=5)
101
- i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale", label="center_scale", value=1)
102
- gr.Examples(examples=img_examples,
103
- inputs=[i2v_input_image,i2v_elevation,i2v_center_scale],
104
- examples_per_page=6
105
- )
106
- # step 2 - camera trajectory generation
107
- gr.Markdown("---\n## Step 2: Input camera trajectory", show_label=False, visible=True)
108
- gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> Input a d_phi sequence, a d_theta sequence, and a d_r sequence to generate a camera trajectory. In the sequences, a positive d_phi moves the camera to the right, a negative d_theta moves the camera up, and a negative d_r moves the camera forward. Ensure that each sequence starts with 0 and contains at least two elements (a start and an end). If you upload a new image, remember to conduct this step again. </div>")
109
- with gr.Row():
110
- with gr.Column():
111
- # camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
112
- i2v_d_phi = gr.Text(label='d_phi sequence')
113
- i2v_d_theta = gr.Text(label='d_theta sequence')
114
- i2v_d_r = gr.Text(label='d_r sequence')
115
- i2v_start_btn = gr.Button("Generate trajectory")
116
- # camera_info = gr.Button(value="Proceed", visible=False)
117
- with gr.Column():
118
- i2v_traj_video = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
119
- gr.Examples(examples=traj_examples,
120
- inputs=[i2v_d_phi, i2v_d_theta, i2v_d_r],
121
- )
122
 
123
- # step 3 - Generate video
124
- gr.Markdown("---\n## Step 3: Generate video", show_label=False, visible=True)
125
- gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> You can reduce the sampling steps for faster inference; try different random seed if the result is not satisfying. </div>")
126
- with gr.Row():
127
- with gr.Column():
128
- i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
129
- i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=0)
130
- i2v_end_btn = gr.Button("Generate video")
131
- # with gr.Tab(label='Result'):
132
  with gr.Column():
133
- i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
134
-
 
 
 
 
 
 
135
 
136
 
137
- i2v_start_btn.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale, i2v_d_phi, i2v_d_theta, i2v_d_r],
138
- outputs=[i2v_traj_video],
139
- fn = image2video.run_traj
140
- )
141
-
142
- i2v_end_btn.click(inputs=[i2v_steps, i2v_seed],
143
- outputs=[i2v_output_video],
144
- fn = image2video.run_gen
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  )
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  return viewcrafter_iface
148
 
149
 
150
  viewcrafter_iface = viewcrafter_demo(opts)
151
  viewcrafter_iface.queue(max_size=10)
152
  viewcrafter_iface.launch() #fixme
153
- # viewcrafter_iface.launch(server_name='11.220.92.96', server_port=80, max_threads=10,debug=False)
154
 
 
10
  from huggingface_hub import hf_hub_download
11
 
12
  traj_examples = [
13
+ ['0 -35; 0 0; 0 -0.1'],
14
+ ['0 -3 -15 -20 -17 -5 0; 0 -2 -5 -10 -8 -5 0 2 5 3 0; 0 0'],
15
+ ['0 3 10 20 17 10 0; 0 -2 -8 -6 0 2 5 3 0; 0 -0.02 -0.09 -0.16 -0.09 0'],
16
+ ['0 30; 0 -1 -5 -4 0 1 5 4 0; 0 -0.2'],
 
17
  ]
18
 
19
+ # img_examples = [
20
+ # ['test/images/boy.png'],
21
+ # ['test/images/car.jpeg'],
22
+ # ['test/images/fruit.jpg'],
23
+ # ['test/images/room.png'],
24
+ # ['test/images/castle.png'],
25
+ # ]
26
+
27
  img_examples = [
28
  ['test/images/boy.png',0,1],
29
  ['test/images/car.jpeg',5,1],
 
69
  from viewcrafter import ViewCrafter
70
 
71
 
72
+ CAMERA_MOTION_MODE = ["Basic Camera Trajectory", "Custom Camera Trajectory"]
73
+
74
+
75
+ def show_traj(mode):
76
+ if mode == 'Left':
77
+ return gr.update(value='0 -35; 0 0; 0 0',visible=True),gr.update(visible=False)
78
+ elif mode == 'Right':
79
+ return gr.update(value='0 35; 0 0; 0 0',visible=True),gr.update(visible=False)
80
+ elif mode == 'Up':
81
+ return gr.update(value='0 0; 0 -30; 0 0',visible=True),gr.update(visible=False)
82
+ elif mode == 'Down':
83
+ return gr.update(value='0 0; 0 20; 0 0',visible=True), gr.update(visible=False)
84
+ elif mode == 'Zoom in':
85
+ return gr.update(value='0 0; 0 0; 0 -0.4',visible=True), gr.update(visible=False)
86
+ elif mode == 'Zoom out':
87
+ return gr.update(value='0 0; 0 0; 0 0.4',visible=True), gr.update(visible=False)
88
+ elif mode == 'Customize':
89
+ return gr.update(value='0 0; 0 0; 0 0',visible=True), gr.update(visible=True)
90
+ elif mode == 'Reset':
91
+ return gr.update(value='0 0; 0 0; 0 0',visible=False), gr.update(visible=False), gr.update(visible=False)
92
+
93
  def viewcrafter_demo(opts):
94
  css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px} #random_button {max-width: 100px !important}"""
95
  image2video = ViewCrafter(opts, gradio = True)
96
+ image2video.run_both = spaces.GPU(image2video.run_gen, duration=300) # fixme
 
97
  with gr.Blocks(analytics_enabled=False, css=css) as viewcrafter_iface:
98
  gr.Markdown("<div align='center'> <h1> ViewCrafter: Taming Video Diffusion Models for High-fidelity Novel View Synthesis </span> </h1> \
99
  <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
 
111
  <a style='font-size:18px;color: #000000' href='https://www.youtube.com/watch?v=WGIEmu9eXmU'> [Video] </a> </div>")
112
 
113
 
114
+ with gr.Row():
115
+ with gr.Column():
116
+ # # step 1: input an image
117
+ # gr.Markdown("---\n## Step 1: Input an Image, selet an elevation angle and a center_scale factor", show_label=False, visible=True)
118
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>1. Estimate an elevation angle that represents the angle at which the image was taken; a value bigger than 0 indicates a top-down view, and it doesn't need to be precise. <br>2. The origin of the world coordinate system is by default defined at the point cloud corresponding to the center pixel of the input image. You can adjust the position of the origin by modifying center_scale; a value smaller than 1 brings the origin closer to you.</div>")
119
+ with gr.Row():
120
+ with gr.Column():
121
+ with gr.Row():
122
+ i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
123
+ with gr.Row():
124
+ i2v_elevation = gr.Slider(minimum=-45, maximum=45, step=1, elem_id="elevation", label="elevation", value=5)
125
+ i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale", label="center_scale", value=1)
126
+ with gr.Column():
127
+ with gr.Row():
128
+ left = gr.Button(value = "Left")
129
+ right = gr.Button(value = "Right")
130
+ with gr.Row():
131
+ up = gr.Button(value = "Up")
132
+ down = gr.Button(value = "Down")
133
+ with gr.Row():
134
+ zin = gr.Button(value = "Zoom in")
135
+ zout = gr.Button(value = "Zoom out")
136
+ with gr.Row():
137
+ custom = gr.Button(value = "Customize")
138
+ reset = gr.Button(value = "Reset")
 
 
 
 
 
 
 
 
 
 
139
 
 
 
 
 
 
 
 
 
 
140
  with gr.Column():
141
+ with gr.Row():
142
+ with gr.Column():
143
+ i2v_pose = gr.Text(value = '0 0; 0 0; 0 0', label='poses(d_phi sequence; d_theta sequence; d_r sequence)',visible=False)
144
+ with gr.Column(visible=False) as i2v_egs:
145
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>Please refer to the <a href='https://github.com/Drexubery/ViewCrafter/blob/main/docs/gradio_tutorial.md' target='_blank'>tutorial</a> for customizing camera trajectory.</div>")
146
+ gr.Examples(examples=traj_examples,
147
+ inputs=[i2v_pose],
148
+ )
149
 
150
 
151
+ # step 3 - Generate video
152
+ with gr.Column():
153
+ # gr.Markdown("---\n## Step 3: Generate video", show_label=False, visible=True)
154
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> You can reduce the sampling steps for faster inference; try different random seed if the result is not satisfying. </div>")
155
+ with gr.Row():
156
+ with gr.Column():
157
+ i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
158
+ with gr.Column():
159
+ with gr.Row():
160
+ i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
161
+ i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=0)
162
+ i2v_end_btn = gr.Button("Generate video")
163
+ with gr.Column():
164
+ i2v_traj_video = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
165
+
166
+
167
+ gr.Examples(examples=img_examples,
168
+ inputs=[i2v_input_image,i2v_elevation, i2v_center_scale,],
169
+ # examples_per_page=6
170
+ )
171
+
172
+
173
+
174
+ i2v_end_btn.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale, i2v_pose, i2v_steps, i2v_seed],
175
+ outputs=[i2v_output_video,i2v_traj_video],
176
+ fn = image2video.run_both
177
  )
178
 
179
+ left.click(inputs=[left],
180
+ outputs=[i2v_pose,i2v_egs],
181
+ fn = show_traj
182
+ )
183
+ right.click(inputs=[right],
184
+ outputs=[i2v_pose,i2v_egs],
185
+ fn = show_traj
186
+ )
187
+ up.click(inputs=[up],
188
+ outputs=[i2v_pose,i2v_egs],
189
+ fn = show_traj
190
+ )
191
+ down.click(inputs=[down],
192
+ outputs=[i2v_pose,i2v_egs],
193
+ fn = show_traj
194
+ )
195
+ zin.click(inputs=[zin],
196
+ outputs=[i2v_pose,i2v_egs],
197
+ fn = show_traj
198
+ )
199
+ zout.click(inputs=[zout],
200
+ outputs=[i2v_pose,i2v_egs],
201
+ fn = show_traj
202
+ )
203
+ custom.click(inputs=[custom],
204
+ outputs=[i2v_pose,i2v_egs],
205
+ fn = show_traj
206
+ )
207
+ reset.click(inputs=[reset],
208
+ outputs=[i2v_pose,i2v_egs],
209
+ fn = show_traj
210
+ )
211
+
212
  return viewcrafter_iface
213
 
214
 
215
  viewcrafter_iface = viewcrafter_demo(opts)
216
  viewcrafter_iface.queue(max_size=10)
217
  viewcrafter_iface.launch() #fixme
218
+ # viewcrafter_iface.launch(server_name='11.220.92.96', server_port=80, max_threads=10,debug=True)
219
 
app_2step.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import sys
4
+ # import spaces #fixme
5
+
6
+ import random
7
+ import gradio as gr
8
+ import random
9
+ from configs.infer_config import get_parser
10
+ from huggingface_hub import hf_hub_download
11
+
12
+ traj_examples = [
13
+ ['0 -35; 0 0; 0 -0.1'],
14
+ ['0 -3 -15 -20 -17 -5 0; 0 -2 -5 -10 -8 -5 0 2 5 3 0; 0 0'],
15
+ ['0 3 10 20 17 10 0; 0 -2 -8 -6 0 2 5 3 0; 0 -0.02 -0.09 -0.16 -0.09 0'],
16
+ ['0 30; 0 -1 -5 -4 0 1 5 4 0; 0 -0.2'],
17
+ ]
18
+
19
+ # img_examples = [
20
+ # ['test/images/boy.png'],
21
+ # ['test/images/car.jpeg'],
22
+ # ['test/images/fruit.jpg'],
23
+ # ['test/images/room.png'],
24
+ # ['test/images/castle.png'],
25
+ # ]
26
+
27
+ img_examples = [
28
+ ['test/images/boy.png',0,1],
29
+ ['test/images/car.jpeg',5,1],
30
+ ['test/images/fruit.jpg',5,1],
31
+ ['test/images/room.png',10,1],
32
+ ['test/images/castle.png',-4,1],
33
+ ]
34
+
35
+ max_seed = 2 ** 31
36
+
37
+ def download_model():
38
+ REPO_ID = 'Drexubery/ViewCrafter_25'
39
+ filename_list = ['model.ckpt']
40
+ for filename in filename_list:
41
+ local_file = os.path.join('./checkpoints/', filename)
42
+ if not os.path.exists(local_file):
43
+ hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/', force_download=True)
44
+
45
+ # download_model() #fixme
46
+ parser = get_parser() # infer_config.py
47
+ opts = parser.parse_args() # default device: 'cuda:0'
48
+ tmp = str(random.randint(10**(5-1), 10**5 - 1))
49
+ opts.save_dir = f'./{tmp}'
50
+ os.makedirs(opts.save_dir,exist_ok=True)
51
+ test_tensor = torch.Tensor([0]).cuda()
52
+ opts.device = str(test_tensor.device)
53
+ # opts.config = './configs/inference_pvd_1024_gradio.yaml' #fixme
54
+ opts.config = './configs/inference_pvd_1024_local.yaml' #fixme
55
+
56
+ # # install pytorch3d # fixme
57
+ # pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
58
+ # version_str="".join([
59
+ # f"py3{sys.version_info.minor}_cu",
60
+ # torch.version.cuda.replace(".",""),
61
+ # f"_pyt{pyt_version_str}"
62
+ # ])
63
+ # print(version_str)
64
+ # os.system(f"{sys.executable} -m pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html")
65
+ # os.system("mkdir -p checkpoints/ && wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/")
66
+ # print(f'>>> System info: {version_str}')
67
+
68
+
69
+ from viewcrafter import ViewCrafter
70
+
71
+
72
+ CAMERA_MOTION_MODE = ["Basic Camera Trajectory", "Custom Camera Trajectory"]
73
+
74
+
75
+ def show_traj(mode):
76
+ if mode == 'Left':
77
+ return gr.update(value='0 -35; 0 0; 0 0',visible=True), gr.update(visible=True), gr.update(visible=True),gr.update(visible=False)
78
+ elif mode == 'Right':
79
+ return gr.update(value='0 35; 0 0; 0 0',visible=True), gr.update(visible=True), gr.update(visible=True),gr.update(visible=False)
80
+ elif mode == 'Up':
81
+ return gr.update(value='0 0; 0 -30; 0 0',visible=True), gr.update(visible=True), gr.update(visible=True),gr.update(visible=False)
82
+ elif mode == 'Down':
83
+ return gr.update(value='0 0; 0 20; 0 0',visible=True), gr.update(visible=True), gr.update(visible=True),gr.update(visible=False)
84
+ elif mode == 'Zoom in':
85
+ return gr.update(value='0 0; 0 0; 0 -0.4',visible=True), gr.update(visible=True), gr.update(visible=True),gr.update(visible=False)
86
+ elif mode == 'Zoom out':
87
+ return gr.update(value='0 0; 0 0; 0 0.4',visible=True), gr.update(visible=True), gr.update(visible=True),gr.update(visible=False)
88
+ elif mode == 'Customize':
89
+ return gr.update(value='0 0; 0 0; 0 0',visible=True), gr.update(visible=True), gr.update(visible=True),gr.update(visible=True)
90
+ elif mode == 'Reset':
91
+ return gr.update(value='0 0; 0 0; 0 0',visible=False), gr.update(visible=False), gr.update(visible=False),gr.update(visible=False)
92
+
93
+ def viewcrafter_demo(opts):
94
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px} #random_button {max-width: 100px !important}"""
95
+ image2video = ViewCrafter(opts, gradio = True)
96
+ # image2video.run_traj_basic = spaces.GPU(image2video.run_traj_basic, duration=50) # fixme
97
+ # image2video.run_traj = spaces.GPU(image2video.run_traj, duration=50) # fixme
98
+ # image2video.run_gen = spaces.GPU(image2video.run_gen, duration=260) # fixme
99
+ with gr.Blocks(analytics_enabled=False, css=css) as viewcrafter_iface:
100
+ gr.Markdown("<div align='center'> <h1> ViewCrafter: Taming Video Diffusion Models for High-fidelity Novel View Synthesis </span> </h1> \
101
+ <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
102
+ <a href='https://scholar.google.com/citations?user=UOE8-qsAAAAJ&hl=zh-CN'>Wangbo Yu</a>, \
103
+ <a href='https://doubiiu.github.io/'>Jinbo Xing</a>, <a href=''>Li Yuan</a>, \
104
+ <a href='https://wbhu.github.io/'>Wenbo Hu</a>, <a href='https://xiaoyu258.github.io/'>Xiaoyu Li</a>,\
105
+ <a href=''>Zhipeng Huang</a>, <a href='https://scholar.google.com/citations?user=qgdesEcAAAAJ&hl=en/'>Xiangjun Gao</a>,\
106
+ <a href='https://www.cse.cuhk.edu.hk/~ttwong/myself.html/'>Tien-Tsin Wong</a>,\
107
+ <a href='https://scholar.google.com/citations?hl=en&user=4oXBp9UAAAAJ&view_op=list_works&sortby=pubdate/'>Ying Shan</a>\
108
+ <a href=''>Yonghong Tian</a>\
109
+ </h2> \
110
+ <a style='font-size:18px;color: #000000' href='https://arxiv.org/abs/2409.02048'> [ArXiv] </a>\
111
+ <a style='font-size:18px;color: #000000' href='https://drexubery.github.io/ViewCrafter/'> [Project Page] </a>\
112
+ <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Drexubery/ViewCrafter'> [Github] </a>\
113
+ <a style='font-size:18px;color: #000000' href='https://www.youtube.com/watch?v=WGIEmu9eXmU'> [Video] </a> </div>")
114
+
115
+
116
+ with gr.Row():
117
+ with gr.Column():
118
+ # # step 1: input an image
119
+ # gr.Markdown("---\n## Step 1: Input an Image, selet an elevation angle and a center_scale factor", show_label=False, visible=True)
120
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>1. Estimate an elevation angle that represents the angle at which the image was taken; a value bigger than 0 indicates a top-down view, and it doesn't need to be precise. <br>2. The origin of the world coordinate system is by default defined at the point cloud corresponding to the center pixel of the input image. You can adjust the position of the origin by modifying center_scale; a value smaller than 1 brings the origin closer to you.</div>")
121
+ with gr.Row():
122
+ with gr.Column():
123
+ with gr.Row():
124
+ i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
125
+ with gr.Row():
126
+ i2v_elevation = gr.Slider(minimum=-45, maximum=45, step=1, elem_id="elevation", label="elevation", value=5)
127
+ i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale", label="center_scale", value=1)
128
+ with gr.Column():
129
+ with gr.Row():
130
+ left = gr.Button(value = "Left")
131
+ right = gr.Button(value = "Right")
132
+ with gr.Row():
133
+ up = gr.Button(value = "Up")
134
+ down = gr.Button(value = "Down")
135
+ with gr.Row():
136
+ zin = gr.Button(value = "Zoom in")
137
+ zout = gr.Button(value = "Zoom out")
138
+ with gr.Row():
139
+ custom = gr.Button(value = "Customize")
140
+ reset = gr.Button(value = "Reset")
141
+
142
+ with gr.Column():
143
+ with gr.Row():
144
+ with gr.Column():
145
+ i2v_pose = gr.Text(value = '0 0; 0 0; 0 0', label='poses(d_phi;d_theta;d_r)',visible=False)
146
+ with gr.Column(visible=False) as i2v_egs:
147
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>Please refer to the <a href='https://github.com/Drexubery/ViewCrafter/blob/main/docs/gradio_tutorial.md' target='_blank'>tutorial</a> for customizing camera trajectory.</div>")
148
+ gr.Examples(examples=traj_examples,
149
+ inputs=[i2v_pose],
150
+ )
151
+ with gr.Column():
152
+ i2v_traj_btn = gr.Button("2.Generate camera trajectory",visible=False)
153
+ i2v_traj_video = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True,visible=False)
154
+
155
+ # step 3 - Generate video
156
+ with gr.Column():
157
+ # gr.Markdown("---\n## Step 3: Generate video", show_label=False, visible=True)
158
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> You can reduce the sampling steps for faster inference; try different random seed if the result is not satisfying. </div>")
159
+ with gr.Row():
160
+ with gr.Column():
161
+ i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
162
+ with gr.Column():
163
+ with gr.Row():
164
+ i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
165
+ i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=0)
166
+ i2v_end_btn = gr.Button("3.Generate video")
167
+ # with gr.Tab(label='Result'):
168
+
169
+ gr.Examples(examples=img_examples,
170
+ inputs=[i2v_input_image,i2v_elevation, i2v_center_scale,],
171
+ # examples_per_page=6
172
+ )
173
+
174
+
175
+ # generate trajectory buttn
176
+ i2v_traj_btn.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale, i2v_pose],
177
+ outputs=[i2v_traj_video],
178
+ fn = image2video.run_traj
179
+ )
180
+
181
+
182
+ i2v_end_btn.click(inputs=[i2v_steps, i2v_seed],
183
+ outputs=[i2v_output_video],
184
+ fn = image2video.run_gen
185
+ )
186
+
187
+ left.click(inputs=[left],
188
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
189
+ fn = show_traj
190
+ )
191
+ right.click(inputs=[right],
192
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
193
+ fn = show_traj
194
+ )
195
+ up.click(inputs=[up],
196
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
197
+ fn = show_traj
198
+ )
199
+ down.click(inputs=[down],
200
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
201
+ fn = show_traj
202
+ )
203
+ zin.click(inputs=[zin],
204
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
205
+ fn = show_traj
206
+ )
207
+ zout.click(inputs=[zout],
208
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
209
+ fn = show_traj
210
+ )
211
+ custom.click(inputs=[custom],
212
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
213
+ fn = show_traj
214
+ )
215
+ reset.click(inputs=[reset],
216
+ outputs=[i2v_pose,i2v_traj_btn,i2v_traj_video,i2v_egs],
217
+ fn = show_traj
218
+ )
219
+
220
+ return viewcrafter_iface
221
+
222
+
223
+ viewcrafter_iface = viewcrafter_demo(opts)
224
+ viewcrafter_iface.queue(max_size=10)
225
+ # viewcrafter_iface.launch() #fixme
226
+ viewcrafter_iface.launch(server_name='11.220.92.96', server_port=80, max_threads=10,debug=True)
227
+
app_new1.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import sys
4
+ # import spaces #fixme
5
+
6
+ import random
7
+ import gradio as gr
8
+ import random
9
+ from configs.infer_config import get_parser
10
+ from huggingface_hub import hf_hub_download
11
+
12
+ traj_examples = [
13
+ ['0 40', '0 0', '0 0'],
14
+ ['0 -35', '0 0', '0 -0.1'],
15
+ ['0 -3 -15 -20 -17 -5 0', '0 -2 -5 -10 -8 -5 0 2 5 3 0', '0 0'],
16
+ ['0 3 10 20 17 10 0', '0 -2 -8 -6 0 2 5 3 0', '0 -0.02 -0.09 -0.16 -0.09 0'],
17
+ ['0 30', '0 -1 -5 -4 0 1 5 4 0', '0 -0.2'],
18
+ ]
19
+
20
+ img_examples = [
21
+ ['test/images/boy.png',0,1],
22
+ ['test/images/car.jpeg',5,1],
23
+ ['test/images/fruit.jpg',5,1],
24
+ ['test/images/room.png',10,1],
25
+ ['test/images/castle.png',-4,1],
26
+ ]
27
+
28
+ max_seed = 2 ** 31
29
+
30
+ def download_model():
31
+ REPO_ID = 'Drexubery/ViewCrafter_25'
32
+ filename_list = ['model.ckpt']
33
+ for filename in filename_list:
34
+ local_file = os.path.join('./checkpoints/', filename)
35
+ if not os.path.exists(local_file):
36
+ hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/', force_download=True)
37
+
38
+ # download_model() #fixme
39
+ parser = get_parser() # infer_config.py
40
+ opts = parser.parse_args() # default device: 'cuda:0'
41
+ tmp = str(random.randint(10**(5-1), 10**5 - 1))
42
+ opts.save_dir = f'./{tmp}'
43
+ os.makedirs(opts.save_dir,exist_ok=True)
44
+ test_tensor = torch.Tensor([0]).cuda()
45
+ opts.device = str(test_tensor.device)
46
+ # opts.config = './configs/inference_pvd_1024_gradio.yaml' #fixme
47
+ opts.config = './configs/inference_pvd_1024_local.yaml' #fixme
48
+
49
+ # # install pytorch3d # fixme
50
+ # pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
51
+ # version_str="".join([
52
+ # f"py3{sys.version_info.minor}_cu",
53
+ # torch.version.cuda.replace(".",""),
54
+ # f"_pyt{pyt_version_str}"
55
+ # ])
56
+ # print(version_str)
57
+ # os.system(f"{sys.executable} -m pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html")
58
+ # os.system("mkdir -p checkpoints/ && wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/")
59
+ # print(f'>>> System info: {version_str}')
60
+
61
+
62
+ from viewcrafter import ViewCrafter
63
+
64
+
65
+ CAMERA_MOTION_MODE = ["Basic Camera Trajectory", "Custom Camera Trajectory"]
66
+
67
+ def proceed(mode):
68
+ if mode == "Basic Camera Trajectory":
69
+ return gr.update(visible=True), gr.update(visible=False)
70
+ else:
71
+ return gr.update(visible=False), gr.update(visible=True)
72
+
73
+
74
+
75
+ def viewcrafter_demo(opts):
76
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px} #random_button {max-width: 100px !important}"""
77
+ image2video = ViewCrafter(opts, gradio = True)
78
+ # image2video.run_traj_basic = spaces.GPU(image2video.run_traj_basic, duration=50) # fixme
79
+ # image2video.run_traj = spaces.GPU(image2video.run_traj, duration=50) # fixme
80
+ # image2video.run_gen = spaces.GPU(image2video.run_gen, duration=260) # fixme
81
+ with gr.Blocks(analytics_enabled=False, css=css) as viewcrafter_iface:
82
+ gr.Markdown("<div align='center'> <h1> ViewCrafter: Taming Video Diffusion Models for High-fidelity Novel View Synthesis </span> </h1> \
83
+ <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
84
+ <a href='https://scholar.google.com/citations?user=UOE8-qsAAAAJ&hl=zh-CN'>Wangbo Yu</a>, \
85
+ <a href='https://doubiiu.github.io/'>Jinbo Xing</a>, <a href=''>Li Yuan</a>, \
86
+ <a href='https://wbhu.github.io/'>Wenbo Hu</a>, <a href='https://xiaoyu258.github.io/'>Xiaoyu Li</a>,\
87
+ <a href=''>Zhipeng Huang</a>, <a href='https://scholar.google.com/citations?user=qgdesEcAAAAJ&hl=en/'>Xiangjun Gao</a>,\
88
+ <a href='https://www.cse.cuhk.edu.hk/~ttwong/myself.html/'>Tien-Tsin Wong</a>,\
89
+ <a href='https://scholar.google.com/citations?hl=en&user=4oXBp9UAAAAJ&view_op=list_works&sortby=pubdate/'>Ying Shan</a>\
90
+ <a href=''>Yonghong Tian</a>\
91
+ </h2> \
92
+ <a style='font-size:18px;color: #000000' href='https://arxiv.org/abs/2409.02048'> [ArXiv] </a>\
93
+ <a style='font-size:18px;color: #000000' href='https://drexubery.github.io/ViewCrafter/'> [Project Page] </a>\
94
+ <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Drexubery/ViewCrafter'> [Github] </a>\
95
+ <a style='font-size:18px;color: #000000' href='https://www.youtube.com/watch?v=WGIEmu9eXmU'> [Video] </a> </div>")
96
+
97
+
98
+ with gr.Column():
99
+ # # step 0: tutorial
100
+ # gr.Markdown("## Step 0: Read tutorial", show_label=False)
101
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>Please refer to the tutorial <a href='https://github.com/Drexubery/ViewCrafter/blob/main/docs/gradio_tutorial.md' target='_blank'>here</a> for best practice, which includes the cameara system defination and the renderer parameters.</div>")
102
+
103
+ # step 2: input an image
104
+ gr.Markdown("---\n## Step 1: Input an Image, selet an elevation angle and a center_scale factor", show_label=False, visible=True)
105
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>1. Estimate an elevation angle that represents the angle at which the image was taken; a value bigger than 0 indicates a top-down view, and it doesn't need to be precise. <br>2. The origin of the world coordinate system is by default defined at the point cloud corresponding to the center pixel of the input image. You can adjust the position of the origin by modifying center_scale; a value smaller than 1 brings the origin closer to you.</div>")
106
+ with gr.Row(equal_height=True):
107
+ with gr.Column(scale=2):
108
+ with gr.Row():
109
+ i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
110
+ with gr.Row():
111
+ i2v_elevation = gr.Slider(minimum=-45, maximum=45, step=1, elem_id="elevation", label="elevation", value=5)
112
+ i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale", label="center_scale", value=1)
113
+ gr.Examples(examples=img_examples,
114
+ inputs=[i2v_input_image,i2v_elevation,i2v_center_scale],
115
+ examples_per_page=6
116
+ )
117
+
118
+ # step 2 - camera trajectory generation
119
+ gr.Markdown("---\n## Step 2: Input camera trajectory", show_label=False, visible=True)
120
+ gr.Markdown(f"\n - {CAMERA_MOTION_MODE[0]}: Select from 6 basic camera trajectory \
121
+ \n - {CAMERA_MOTION_MODE[1]}: Customize complex camera trajectory yourself \
122
+ \n - Click `Proceed` to go into next step",
123
+ show_label=False, visible=True)
124
+ with gr.Row():
125
+ camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera trajectory mode", interactive=True, visible=True)
126
+ pro_btn = gr.Button("Proceed")
127
+
128
+ with gr.Column(visible=False) as ouput1:
129
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> Select one cameras trajectory. </div>")
130
+ with gr.Row():
131
+ with gr.Column():
132
+ left = gr.Button(value = "Left")
133
+ right = gr.Button(value = "Right")
134
+ up = gr.Button(value = "Up")
135
+ down = gr.Button(value = "Down")
136
+ zoomin = gr.Button(value = "Zoom in")
137
+ zoomout = gr.Button(value = "Zoom out")
138
+
139
+ with gr.Column():
140
+ i2v_traj_video1 = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
141
+
142
+
143
+ with gr.Column(visible=False) as ouput2:
144
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> Input a d_phi sequence, a d_theta sequence, and a d_r sequence to generate a camera trajectory. In the sequences, a positive d_phi moves the camera to the right, a negative d_theta moves the camera up, and a negative d_r moves the camera forward. Ensure that each sequence starts with 0 and contains at least two elements (a start and an end). If you upload a new image, remember to conduct this step again. </div>")
145
+ with gr.Row():
146
+ with gr.Column():
147
+ # camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
148
+ i2v_d_phi2 = gr.Text(label='d_phi sequence')
149
+ i2v_d_theta2 = gr.Text(label='d_theta sequence')
150
+ i2v_d_r2 = gr.Text(label='d_r sequence')
151
+ i2v_traj_btn2 = gr.Button("Generate custom trajectory")
152
+ # camera_info = gr.Button(value="Proceed", visible=False)
153
+ with gr.Column():
154
+ i2v_traj_video2 = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
155
+ with gr.Column():
156
+ gr.Examples(examples=traj_examples,
157
+ inputs=[i2v_d_phi2, i2v_d_theta2, i2v_d_r2],
158
+ )
159
+
160
+
161
+ # with gr.Column():
162
+ # i2v_traj_btn = gr.Button("Generate trajectory")
163
+ # i2v_traj_video = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
164
+
165
+ # step 3 - Generate video
166
+ gr.Markdown("---\n## Step 3: Generate video", show_label=False, visible=True)
167
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> You can reduce the sampling steps for faster inference; try different random seed if the result is not satisfying. </div>")
168
+ with gr.Row():
169
+ with gr.Column():
170
+ i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
171
+ i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=0)
172
+ i2v_end_btn = gr.Button("Generate video")
173
+ # with gr.Tab(label='Result'):
174
+ with gr.Column():
175
+ i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
176
+
177
+
178
+ pro_btn.click(inputs=[camera_mode],
179
+ outputs=[ouput1,ouput2],
180
+ fn = proceed
181
+ )
182
+
183
+
184
+ i2v_traj_btn2.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale, i2v_d_phi2, i2v_d_theta2, i2v_d_r2],
185
+ outputs=[i2v_traj_video2],
186
+ fn = image2video.run_traj
187
+ )
188
+
189
+
190
+ left.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,left],
191
+ outputs=[i2v_traj_video1],
192
+ fn = image2video.run_traj_basic
193
+ )
194
+
195
+ right.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,right],
196
+ outputs=[i2v_traj_video1],
197
+ fn = image2video.run_traj_basic
198
+ )
199
+ up.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,up],
200
+ outputs=[i2v_traj_video1],
201
+ fn = image2video.run_traj_basic
202
+ )
203
+
204
+ down.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,down],
205
+ outputs=[i2v_traj_video1],
206
+ fn = image2video.run_traj_basic
207
+ )
208
+ zoomin.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,zoomin],
209
+ outputs=[i2v_traj_video1],
210
+ fn = image2video.run_traj_basic
211
+ )
212
+
213
+ zoomout.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,zoomout],
214
+ outputs=[i2v_traj_video1],
215
+ fn = image2video.run_traj_basic
216
+ )
217
+
218
+ i2v_end_btn.click(inputs=[i2v_steps, i2v_seed],
219
+ outputs=[i2v_output_video],
220
+ fn = image2video.run_gen
221
+ )
222
+
223
+ return viewcrafter_iface
224
+
225
+
226
+ viewcrafter_iface = viewcrafter_demo(opts)
227
+ viewcrafter_iface.queue(max_size=10)
228
+ # viewcrafter_iface.launch() #fixme
229
+ viewcrafter_iface.launch(server_name='11.220.92.96', server_port=80, max_threads=10,debug=True)
230
+
app_new2.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import sys
4
+ # import spaces #fixme
5
+
6
+ import random
7
+ import gradio as gr
8
+ import random
9
+ from configs.infer_config import get_parser
10
+ from huggingface_hub import hf_hub_download
11
+
12
+ traj_examples = [
13
+ ['0 40', '0 0', '0 0'],
14
+ ['0 -35', '0 0', '0 -0.1'],
15
+ ['0 -3 -15 -20 -17 -5 0', '0 -2 -5 -10 -8 -5 0 2 5 3 0', '0 0'],
16
+ ['0 3 10 20 17 10 0', '0 -2 -8 -6 0 2 5 3 0', '0 -0.02 -0.09 -0.16 -0.09 0'],
17
+ ['0 30', '0 -1 -5 -4 0 1 5 4 0', '0 -0.2'],
18
+ ]
19
+
20
+ # img_examples = [
21
+ # ['test/images/boy.png'],
22
+ # ['test/images/car.jpeg'],
23
+ # ['test/images/fruit.jpg'],
24
+ # ['test/images/room.png'],
25
+ # ['test/images/castle.png'],
26
+ # ]
27
+
28
+ img_examples = [
29
+ ['test/images/boy.png',0,1],
30
+ ['test/images/car.jpeg',5,1],
31
+ ['test/images/fruit.jpg',5,1],
32
+ ['test/images/room.png',10,1],
33
+ ['test/images/castle.png',-4,1],
34
+ ]
35
+
36
+ max_seed = 2 ** 31
37
+
38
+ def download_model():
39
+ REPO_ID = 'Drexubery/ViewCrafter_25'
40
+ filename_list = ['model.ckpt']
41
+ for filename in filename_list:
42
+ local_file = os.path.join('./checkpoints/', filename)
43
+ if not os.path.exists(local_file):
44
+ hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/', force_download=True)
45
+
46
+ # download_model() #fixme
47
+ parser = get_parser() # infer_config.py
48
+ opts = parser.parse_args() # default device: 'cuda:0'
49
+ tmp = str(random.randint(10**(5-1), 10**5 - 1))
50
+ opts.save_dir = f'./{tmp}'
51
+ os.makedirs(opts.save_dir,exist_ok=True)
52
+ test_tensor = torch.Tensor([0]).cuda()
53
+ opts.device = str(test_tensor.device)
54
+ # opts.config = './configs/inference_pvd_1024_gradio.yaml' #fixme
55
+ opts.config = './configs/inference_pvd_1024_local.yaml' #fixme
56
+
57
+ # # install pytorch3d # fixme
58
+ # pyt_version_str=torch.__version__.split("+")[0].replace(".", "")
59
+ # version_str="".join([
60
+ # f"py3{sys.version_info.minor}_cu",
61
+ # torch.version.cuda.replace(".",""),
62
+ # f"_pyt{pyt_version_str}"
63
+ # ])
64
+ # print(version_str)
65
+ # os.system(f"{sys.executable} -m pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html")
66
+ # os.system("mkdir -p checkpoints/ && wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/")
67
+ # print(f'>>> System info: {version_str}')
68
+
69
+
70
+ from viewcrafter import ViewCrafter
71
+
72
+
73
+ CAMERA_MOTION_MODE = ["Basic Camera Trajectory", "Custom Camera Trajectory"]
74
+
75
+ def proceed(mode):
76
+ if mode == "Basic Camera Trajectory":
77
+ return gr.update(visible=True), gr.update(visible=False)
78
+ else:
79
+ return gr.update(visible=False), gr.update(visible=True)
80
+
81
+
82
+
83
+ def viewcrafter_demo(opts):
84
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height:576px} #random_button {max-width: 100px !important}"""
85
+ image2video = ViewCrafter(opts, gradio = True)
86
+ # image2video.run_traj_basic = spaces.GPU(image2video.run_traj_basic, duration=50) # fixme
87
+ # image2video.run_traj = spaces.GPU(image2video.run_traj, duration=50) # fixme
88
+ # image2video.run_gen = spaces.GPU(image2video.run_gen, duration=260) # fixme
89
+ with gr.Blocks(analytics_enabled=False, css=css) as viewcrafter_iface:
90
+ gr.Markdown("<div align='center'> <h1> ViewCrafter: Taming Video Diffusion Models for High-fidelity Novel View Synthesis </span> </h1> \
91
+ <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
92
+ <a href='https://scholar.google.com/citations?user=UOE8-qsAAAAJ&hl=zh-CN'>Wangbo Yu</a>, \
93
+ <a href='https://doubiiu.github.io/'>Jinbo Xing</a>, <a href=''>Li Yuan</a>, \
94
+ <a href='https://wbhu.github.io/'>Wenbo Hu</a>, <a href='https://xiaoyu258.github.io/'>Xiaoyu Li</a>,\
95
+ <a href=''>Zhipeng Huang</a>, <a href='https://scholar.google.com/citations?user=qgdesEcAAAAJ&hl=en/'>Xiangjun Gao</a>,\
96
+ <a href='https://www.cse.cuhk.edu.hk/~ttwong/myself.html/'>Tien-Tsin Wong</a>,\
97
+ <a href='https://scholar.google.com/citations?hl=en&user=4oXBp9UAAAAJ&view_op=list_works&sortby=pubdate/'>Ying Shan</a>\
98
+ <a href=''>Yonghong Tian</a>\
99
+ </h2> \
100
+ <a style='font-size:18px;color: #000000' href='https://arxiv.org/abs/2409.02048'> [ArXiv] </a>\
101
+ <a style='font-size:18px;color: #000000' href='https://drexubery.github.io/ViewCrafter/'> [Project Page] </a>\
102
+ <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Drexubery/ViewCrafter'> [Github] </a>\
103
+ <a style='font-size:18px;color: #000000' href='https://www.youtube.com/watch?v=WGIEmu9eXmU'> [Video] </a> </div>")
104
+
105
+
106
+ with gr.Row():
107
+ with gr.Column():
108
+ # # step 1: input an image
109
+ # gr.Markdown("---\n## Step 1: Input an Image, selet an elevation angle and a center_scale factor", show_label=False, visible=True)
110
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'>1. Estimate an elevation angle that represents the angle at which the image was taken; a value bigger than 0 indicates a top-down view, and it doesn't need to be precise. <br>2. The origin of the world coordinate system is by default defined at the point cloud corresponding to the center pixel of the input image. You can adjust the position of the origin by modifying center_scale; a value smaller than 1 brings the origin closer to you.</div>")
111
+ with gr.Row():
112
+ with gr.Column():
113
+ with gr.Row():
114
+ i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
115
+ with gr.Row():
116
+ i2v_elevation = gr.Slider(minimum=-45, maximum=45, step=1, elem_id="elevation", label="elevation", value=5)
117
+ i2v_center_scale = gr.Slider(minimum=0.1, maximum=2, step=0.1, elem_id="i2v_center_scale", label="center_scale", value=1)
118
+
119
+ # with gr.Column():
120
+ # step 2 - camera trajectory generation
121
+ # gr.Markdown("---\n## Step 2: Input camera trajectory", show_label=False, visible=True)
122
+ # gr.Markdown(f"\n - {CAMERA_MOTION_MODE[0]}: Select from 6 basic camera trajectory \
123
+ # \n - {CAMERA_MOTION_MODE[1]}: Customize complex camera trajectory yourself \
124
+ # \n - Click `Proceed` to go into next step",
125
+ # show_label=False, visible=True)
126
+ with gr.Column():
127
+ camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera trajectory mode", interactive=True, visible=True)
128
+ pro_btn = gr.Button("1.Select camera trajectory mode")
129
+
130
+ with gr.Column(visible=False) as ouput1:
131
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> 2.Click on one basic trajectory </div>")
132
+ with gr.Row():
133
+ left = gr.Button(value = "Left")
134
+ right = gr.Button(value = "Right")
135
+ with gr.Row():
136
+ up = gr.Button(value = "Up")
137
+ down = gr.Button(value = "Down")
138
+ with gr.Row():
139
+ zoomin = gr.Button(value = "Zoom in")
140
+ zoomout = gr.Button(value = "Zoom out")
141
+
142
+ with gr.Column():
143
+ i2v_traj_video1 = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
144
+
145
+
146
+ with gr.Column(visible=False) as ouput2:
147
+ gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> Input a d_phi sequence, a d_theta sequence, and a d_r sequence, then click 'Generate custom trajectory' <a href='https://github.com/Drexubery/ViewCrafter/blob/main/docs/gradio_tutorial.md' target='_blank'>(Tutorial)</a> </div>")
148
+ with gr.Row():
149
+ with gr.Column():
150
+ # camera_mode = gr.Radio(choices=CAMERA_MOTION_MODE, value=CAMERA_MOTION_MODE[0], label="Camera Motion Control Mode", interactive=True, visible=False)
151
+ i2v_d_phi2 = gr.Text(label='d_phi sequence')
152
+ i2v_d_theta2 = gr.Text(label='d_theta sequence')
153
+ i2v_d_r2 = gr.Text(label='d_r sequence')
154
+ i2v_traj_btn2 = gr.Button("2.Generate custom trajectory")
155
+ # camera_info = gr.Button(value="Proceed", visible=False)
156
+ with gr.Column():
157
+ i2v_traj_video2 = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
158
+ with gr.Column():
159
+ gr.Examples(examples=traj_examples,
160
+ inputs=[i2v_d_phi2, i2v_d_theta2, i2v_d_r2],
161
+ )
162
+
163
+ # with gr.Column():
164
+ # i2v_traj_btn = gr.Button("Generate trajectory")
165
+ # i2v_traj_video = gr.Video(label="Camera Trajectory",elem_id="traj_vid",autoplay=True,show_share_button=True)
166
+
167
+ # step 3 - Generate video
168
+ with gr.Column():
169
+ # gr.Markdown("---\n## Step 3: Generate video", show_label=False, visible=True)
170
+ # gr.Markdown("<div align='left' style='font-size:18px;color: #000000'> You can reduce the sampling steps for faster inference; try different random seed if the result is not satisfying. </div>")
171
+ with gr.Row():
172
+ with gr.Column():
173
+ i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
174
+ with gr.Column():
175
+ with gr.Row():
176
+ i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
177
+ i2v_seed = gr.Slider(label='Random seed', minimum=0, maximum=max_seed, step=1, value=0)
178
+ i2v_end_btn = gr.Button("3.Generate video")
179
+ # with gr.Tab(label='Result'):
180
+
181
+ gr.Examples(examples=img_examples,
182
+ inputs=[i2v_input_image,i2v_elevation, i2v_center_scale,],
183
+ # examples_per_page=6
184
+ )
185
+
186
+ pro_btn.click(inputs=[camera_mode],
187
+ outputs=[ouput1,ouput2],
188
+ fn = proceed
189
+ )
190
+
191
+ # generate trajectory buttn
192
+ i2v_traj_btn2.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale, i2v_d_phi2, i2v_d_theta2, i2v_d_r2],
193
+ outputs=[i2v_traj_video2],
194
+ fn = image2video.run_traj
195
+ )
196
+
197
+
198
+ left.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,left],
199
+ outputs=[i2v_traj_video1],
200
+ fn = image2video.run_traj_basic
201
+ )
202
+
203
+ right.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,right],
204
+ outputs=[i2v_traj_video1],
205
+ fn = image2video.run_traj_basic
206
+ )
207
+ up.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,up],
208
+ outputs=[i2v_traj_video1],
209
+ fn = image2video.run_traj_basic
210
+ )
211
+
212
+ down.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,down],
213
+ outputs=[i2v_traj_video1],
214
+ fn = image2video.run_traj_basic
215
+ )
216
+ zoomin.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,zoomin],
217
+ outputs=[i2v_traj_video1],
218
+ fn = image2video.run_traj_basic
219
+ )
220
+
221
+ zoomout.click(inputs=[i2v_input_image, i2v_elevation, i2v_center_scale,zoomout],
222
+ outputs=[i2v_traj_video1],
223
+ fn = image2video.run_traj_basic
224
+ )
225
+
226
+ i2v_end_btn.click(inputs=[i2v_steps, i2v_seed],
227
+ outputs=[i2v_output_video],
228
+ fn = image2video.run_gen
229
+ )
230
+
231
+ return viewcrafter_iface
232
+
233
+
234
+ viewcrafter_iface = viewcrafter_demo(opts)
235
+ viewcrafter_iface.queue(max_size=10)
236
+ # viewcrafter_iface.launch() #fixme
237
+ viewcrafter_iface.launch(server_name='11.220.92.96', server_port=80, max_threads=10,debug=True)
238
+
viewcrafter.py CHANGED
@@ -160,9 +160,8 @@ class ViewCrafter:
160
  render_results[-1] = self.img_ori
161
  # torch.Size([25, 576, 1024, 3]), [0,1]
162
  # save_pointcloud_with_normals([imgs[-1]], [pcd[-1]], msk=None, save_path=os.path.join(self.opts.save_dir,'pcd0.ply') , mask_pc=False, reduce_pc=False)
163
- # diffusion_results = self.run_diffusion(render_results)
164
- # save_video((diffusion_results + 1.0) / 2.0, os.path.join(self.opts.save_dir, 'diffusion0.mp4'))
165
-
166
  return render_results
167
 
168
  def nvs_sparse_view(self,iter):
@@ -349,13 +348,14 @@ class ViewCrafter:
349
 
350
  return images, img_ori
351
 
352
- def run_traj(self,i2v_input_image, i2v_elevation, i2v_center_scale, i2v_d_phi, i2v_d_theta, i2v_d_r):
353
  self.opts.elevation = float(i2v_elevation)
354
  self.opts.center_scale = float(i2v_center_scale)
 
355
  self.gradio_traj = [float(i) for i in i2v_d_phi.split()],[float(i) for i in i2v_d_theta.split()],[float(i) for i in i2v_d_r.split()]
356
  transform = transforms.Compose([
357
- transforms.Resize(576),
358
- transforms.CenterCrop((576,1024)),
359
  ])
360
  torch.cuda.empty_cache()
361
  img_tensor = torch.from_numpy(i2v_input_image).permute(2, 0, 1).unsqueeze(0).float().to(self.device)
@@ -392,4 +392,34 @@ class ViewCrafter:
392
  gen_dir = os.path.join(self.opts.save_dir, "diffusion0.mp4")
393
  diffusion_results = self.run_diffusion(render_results)
394
  save_video((diffusion_results + 1.0) / 2.0, os.path.join(self.opts.save_dir, 'diffusion0.mp4'))
395
- return gen_dir
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  render_results[-1] = self.img_ori
161
  # torch.Size([25, 576, 1024, 3]), [0,1]
162
  # save_pointcloud_with_normals([imgs[-1]], [pcd[-1]], msk=None, save_path=os.path.join(self.opts.save_dir,'pcd0.ply') , mask_pc=False, reduce_pc=False)
163
+ diffusion_results = self.run_diffusion(render_results)
164
+ save_video((diffusion_results + 1.0) / 2.0, os.path.join(self.opts.save_dir, 'diffusion0.mp4'))
 
165
  return render_results
166
 
167
  def nvs_sparse_view(self,iter):
 
348
 
349
  return images, img_ori
350
 
351
+ def run_traj(self,i2v_input_image, i2v_elevation, i2v_center_scale, i2v_pose):
352
  self.opts.elevation = float(i2v_elevation)
353
  self.opts.center_scale = float(i2v_center_scale)
354
+ i2v_d_phi,i2v_d_theta,i2v_d_r = [i for i in i2v_pose.split(';')]
355
  self.gradio_traj = [float(i) for i in i2v_d_phi.split()],[float(i) for i in i2v_d_theta.split()],[float(i) for i in i2v_d_r.split()]
356
  transform = transforms.Compose([
357
+ transforms.Resize((576,1024)),
358
+ # transforms.CenterCrop((576,1024)),
359
  ])
360
  torch.cuda.empty_cache()
361
  img_tensor = torch.from_numpy(i2v_input_image).permute(2, 0, 1).unsqueeze(0).float().to(self.device)
 
392
  gen_dir = os.path.join(self.opts.save_dir, "diffusion0.mp4")
393
  diffusion_results = self.run_diffusion(render_results)
394
  save_video((diffusion_results + 1.0) / 2.0, os.path.join(self.opts.save_dir, 'diffusion0.mp4'))
395
+ return gen_dir
396
+
397
+ def run_both(self,i2v_input_image, i2v_elevation, i2v_center_scale, i2v_pose,i2v_steps, i2v_seed):
398
+ self.opts.ddim_steps = i2v_steps
399
+ seed_everything(i2v_seed)
400
+ self.opts.elevation = float(i2v_elevation)
401
+ self.opts.center_scale = float(i2v_center_scale)
402
+ i2v_d_phi,i2v_d_theta,i2v_d_r = [i for i in i2v_pose.split(';')]
403
+ self.gradio_traj = [float(i) for i in i2v_d_phi.split()],[float(i) for i in i2v_d_theta.split()],[float(i) for i in i2v_d_r.split()]
404
+ transform = transforms.Compose([
405
+ transforms.Resize((576,1024)),
406
+ # transforms.CenterCrop((576,1024)),
407
+ ])
408
+ torch.cuda.empty_cache()
409
+ img_tensor = torch.from_numpy(i2v_input_image).permute(2, 0, 1).unsqueeze(0).float().to(self.device)
410
+ img_tensor = (img_tensor / 255. - 0.5) * 2
411
+ image_tensor_resized = transform(img_tensor) #1,3,h,w
412
+ images = get_input_dict(image_tensor_resized,idx = 0,dtype = torch.float32)
413
+ images = [images, copy.deepcopy(images)]
414
+ images[1]['idx'] = 1
415
+ self.images = images
416
+ self.img_ori = (image_tensor_resized.squeeze(0).permute(1,2,0) + 1.)/2.
417
+
418
+ # self.images: torch.Size([1, 3, 288, 512]), [-1,1]
419
+ # self.img_ori: torch.Size([576, 1024, 3]), [0,1]
420
+ # self.images, self.img_ori = self.load_initial_images(image_dir=i2v_input_image)
421
+ self.run_dust3r(input_images=self.images)
422
+ self.nvs_single_view(gradio=True)
423
+ traj_dir = os.path.join(self.opts.save_dir, "viz_traj.mp4")
424
+ gen_dir = os.path.join(self.opts.save_dir, "diffusion0.mp4")
425
+ return gen_dir,traj_dir,