Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -28,12 +28,9 @@ import os
 import time
 import numpy as np
 
-
-
 # Disable Gradio analytics to avoid network-related issues
 gr.analytics_enabled = False
 
-
 def check_package_installed(package_name):
     package_spec = importlib.util.find_spec(package_name)
     if package_spec is None:
@@ -77,11 +74,11 @@ def main(args):
     audio_name = os.path.splitext(os.path.basename(args.test_audio_path))[0]
     predicted_video_256_path = os.path.join(args.result_path, f'{test_image_name}-{audio_name}.mp4')
     predicted_video_512_path = os.path.join(args.result_path, f'{test_image_name}-{audio_name}_SR.mp4')
-
+
     #======Loading Stage 1 model=========
     lia = LIA_Model(motion_dim=args.motion_dim, fusion_type='weighted_sum')
     lia.load_lightning_model(args.stage1_checkpoint_path)
-    lia.to(
+    lia.to('cuda')
     #============================
 
     conf = ffhq256_autoenc()
@@ -122,7 +119,7 @@ def main(args):
         print(f'{args.test_audio_path} does not exist!')
         exit(0)
 
-    img_source = img_preprocessing(args.test_image_path, args.image_size).to(
+    img_source = img_preprocessing(args.test_image_path, args.image_size).to('cuda')
     one_shot_lia_start, one_shot_lia_direction, feats = lia.get_start_direction_code(img_source, img_source, img_source, img_source)
 
     #======Loading Stage 2 model=========
@@ -130,7 +127,7 @@ def main(args):
     state = torch.load(args.stage2_checkpoint_path, map_location='cpu')
     model.load_state_dict(state, strict=True)
     model.ema_model.eval()
-    model.ema_model.to(
+    model.ema_model.to('cuda')
     #=================================
 
     #======Audio Input=========
@@ -144,7 +141,7 @@ def main(args):
         frame_start, frame_end = 0, int(audio_driven_obj.shape[0]/4)
         audio_start, audio_end = int(frame_start * 4), int(frame_end * 4) # The video frame is fixed to 25 hz and the audio is fixed to 100 hz
 
-        audio_driven = torch.Tensor(audio_driven_obj[audio_start:audio_end,:]).unsqueeze(0).float().to(
+        audio_driven = torch.Tensor(audio_driven_obj[audio_start:audio_end,:]).unsqueeze(0).float().to('cuda')
 
     elif conf.infer_type.startswith('hubert'):
         # Hubert features
@@ -163,7 +160,7 @@ def main(args):
 
         # load hubert model
         from transformers import Wav2Vec2FeatureExtractor, HubertModel
-        audio_model = HubertModel.from_pretrained(hubert_model_path).to(
+        audio_model = HubertModel.from_pretrained(hubert_model_path).to('cuda')
         feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model_path)
         audio_model.feature_extractor._freeze_parameters()
         audio_model.eval()
@@ -171,7 +168,7 @@ def main(args):
         # hubert model forward pass
         audio, sr = librosa.load(args.test_audio_path, sr=16000)
         input_values = feature_extractor(audio, sampling_rate=16000, padding=True, do_normalize=True, return_tensors="pt").input_values
-        input_values = input_values.to(
+        input_values = input_values.to('cuda')
         ws_feats = []
         with torch.no_grad():
             outputs = audio_model(input_values, output_hidden_states=True)
@@ -192,11 +189,11 @@ def main(args):
         frame_start, frame_end = 0, int(audio_driven_obj.shape[1]/2)
         audio_start, audio_end = int(frame_start * 2), int(frame_end * 2) # The video frame is fixed to 25 hz and the audio is fixed to 50 hz
 
-        audio_driven = torch.Tensor(audio_driven_obj[:,audio_start:audio_end,:]).unsqueeze(0).float().to(
+        audio_driven = torch.Tensor(audio_driven_obj[:,audio_start:audio_end,:]).unsqueeze(0).float().to('cuda')
     #============================
 
     # Diffusion Noise
-    noisyT = torch.randn((1,frame_end, args.motion_dim)).to(
+    noisyT = torch.randn((1,frame_end, args.motion_dim)).to('cuda')
 
     #======Inputs for Attribute Control=========
     if os.path.exists(args.pose_driven_path):
@@ -215,17 +212,17 @@ def main(args):
             padding = np.tile(pose_obj[-1, :], (frame_end - pose_obj.shape[0], 1))
             pose_obj = np.vstack((pose_obj, padding))
 
-        pose_signal = torch.Tensor(pose_obj).unsqueeze(0).to(
+        pose_signal = torch.Tensor(pose_obj).unsqueeze(0).to('cuda') / 90 # 90 is for normalization here
     else:
-        yaw_signal = torch.zeros(1, frame_end, 1).to(
-        pitch_signal = torch.zeros(1, frame_end, 1).to(
-        roll_signal = torch.zeros(1, frame_end, 1).to(
+        yaw_signal = torch.zeros(1, frame_end, 1).to('cuda') + args.pose_yaw
+        pitch_signal = torch.zeros(1, frame_end, 1).to('cuda') + args.pose_pitch
+        roll_signal = torch.zeros(1, frame_end, 1).to('cuda') + args.pose_roll
         pose_signal = torch.cat((yaw_signal, pitch_signal, roll_signal), dim=-1)
 
     pose_signal = torch.clamp(pose_signal, -1, 1)
 
-    face_location_signal = torch.zeros(1, frame_end, 1).to(
-    face_scae_signal = torch.zeros(1, frame_end, 1).to(
+    face_location_signal = torch.zeros(1, frame_end, 1).to('cuda') + args.face_location
+    face_scae_signal = torch.zeros(1, frame_end, 1).to('cuda') + args.face_scale
     #===========================================
 
     start_time = time.time()
@@ -242,7 +239,7 @@ def main(args):
     start_time = time.time()
     #======Rendering images frame-by-frame=========
     for pred_index in tqdm(range(generated_directions.shape[1])):
-        ori_img_recon = lia.render(one_shot_lia_start, torch.Tensor(generated_directions[:,pred_index,:]).to(
+        ori_img_recon = lia.render(one_shot_lia_start, torch.Tensor(generated_directions[:,pred_index,:]).to('cuda'), feats)
         ori_img_recon = ori_img_recon.clamp(-1, 1)
         wav_pred = (ori_img_recon.detach() + 1) / 2
         saved_image(wav_pred, os.path.join(frames_result_saved_path, "%06d.png"%(pred_index)))
@@ -276,8 +273,9 @@ def main(args):
     else:
         return predicted_video_256_path, predicted_video_256_path
 
+@spaces.GPU
 def generate_video(uploaded_img, uploaded_audio, infer_type,
-                   pose_yaw, pose_pitch, pose_roll, face_location, face_scale, step_T,
+                   pose_yaw, pose_pitch, pose_roll, face_location, face_scale, step_T, face_sr, seed):
     if uploaded_img is None or uploaded_audio is None:
         return None, gr.Markdown("Error: Input image or audio file is empty. Please check and upload both files.")
 
@@ -289,14 +287,6 @@ def generate_video(uploaded_img, uploaded_audio, infer_type,
         "hubert_full_control": "ckpt/stage2_full_control_hubert.ckpt",
     }
 
-    # if face_crop:
-    #     uploaded_img_path = Path(uploaded_img)
-    #     cropped_img_path = uploaded_img_path.with_name(uploaded_img_path.stem + "_crop" + uploaded_img_path.suffix)
-    #     crop_image(uploaded_img, cropped_img_path)
-    #     uploaded_img = str(cropped_img_path)
-
-    # import pdb;pdb.set_trace()
-
    stage2_checkpoint_path = model_mapping.get(infer_type, "default_checkpoint.ckpt")
     try:
         args = argparse.Namespace(
@@ -317,19 +307,14 @@ def generate_video(uploaded_img, uploaded_audio, infer_type,
             face_scale=face_scale,
             step_T=step_T,
             image_size=256,
-            device=
+            device='cuda',
             motion_dim=20,
             decoder_layers=2,
             face_sr=face_sr
         )
 
-        # Save the uploaded audio to the expected path
-        # shutil.copy(uploaded_audio, args.test_audio_path)
-
-        # Run the main function
         output_256_video_path, output_512_video_path = main(args)
 
-        # Check if the output video file exists
         if not os.path.exists(output_256_video_path):
             return None, gr.Markdown("Error: Video generation failed. Please check your inputs and try again.")
         if output_256_video_path == output_512_video_path:
@@ -347,7 +332,6 @@ default_values = {
     "face_scale": 0.5,
     "step_T": 50,
     "seed": 0,
-    "device": "cuda"
 }
 
 with gr.Blocks() as demo:
@@ -373,8 +357,6 @@ with gr.Blocks() as demo:
                 value='hubert_audio_only'
             )
             face_sr = gr.Checkbox(label="Enable Face Super-Resolution (512*512)", value=False)
-            # face_crop = gr.Checkbox(label="Face Crop (Dlib)", value=False)
-            # face_crop = False # TODO
             seed = gr.Number(label="Seed", value=default_values["seed"])
             pose_yaw = gr.Slider(label="pose_yaw", minimum=-1, maximum=1, value=default_values["pose_yaw"])
             pose_pitch = gr.Slider(label="pose_pitch", minimum=-1, maximum=1, value=default_values["pose_pitch"])
@@ -382,14 +364,13 @@
             face_location = gr.Slider(label="face_location", minimum=0, maximum=1, value=default_values["face_location"])
             face_scale = gr.Slider(label="face_scale", minimum=0, maximum=1, value=default_values["face_scale"])
             step_T = gr.Slider(label="step_T", minimum=1, maximum=100, step=1, value=default_values["step_T"])
-            device = gr.Radio(label="Device", choices=["cuda", "cpu"], value=default_values["device"])
 
 
    generate_button.click(
        generate_video,
        inputs=[
            uploaded_img, uploaded_audio, infer_type,
-           pose_yaw, pose_pitch, pose_roll, face_location, face_scale, step_T,
+           pose_yaw, pose_pitch, pose_roll, face_location, face_scale, step_T, face_sr, seed
        ],
        outputs=[output_video_256, output_video_512, output_message]
    )
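
The edits follow the usual ZeroGPU migration pattern: the user-facing device option is removed, device placement is pinned to 'cuda', and the Gradio handler is wrapped in @spaces.GPU so a GPU is attached only for the duration of each call (the "Running on Zero" badge above refers to this hardware tier). A minimal sketch of that pattern, with a hypothetical predict handler and toy model rather than code from this Space:

import spaces            # available on ZeroGPU Spaces (pip package: spaces)
import torch
import gradio as gr

model = torch.nn.Linear(4, 2)   # placeholder model, created at startup on CPU

@spaces.GPU              # ZeroGPU attaches a GPU only while this function runs
def predict(x):
    # On ZeroGPU, CUDA may only be touched inside a @spaces.GPU-decorated call,
    # which is why the commit hard-codes .to('cuda') in code reached from
    # generate_video instead of exposing a cuda/cpu radio button.
    m = model.to('cuda')
    t = torch.full((1, 4), float(x), device='cuda')
    return m(t).detach().cpu().tolist()

with gr.Blocks() as demo:
    inp = gr.Number(label="x", value=1.0)
    out = gr.JSON(label="output")
    gr.Button("Run").click(predict, inputs=inp, outputs=out)

demo.launch()

spaces.GPU also accepts a duration argument (e.g. @spaces.GPU(duration=120)) for handlers that need the GPU longer than the default window.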
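
A side note on the context line gr.analytics_enabled = False: the switches Gradio itself provides for this are the GRADIO_ANALYTICS_ENABLED environment variable and the analytics_enabled argument to gr.Blocks(); a bare module-attribute assignment may not be honored on every code path. A sketch of both forms:

import os
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"  # environment-level switch, set before the app starts

import gradio as gr

with gr.Blocks(analytics_enabled=False) as demo:  # per-app equivalent
    gr.Markdown("analytics disabled")

demo.launch()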