import os, random, json
import numpy as np
from pydub import AudioSegment
from pydub.utils import make_chunks
from pydub.effects import compress_dynamic_range
from PIL import Image
import cv2
from moviepy.editor import VideoClip, AudioFileClip
import gradio as gr
# Load configuration
def load_config(config_path):
    with open(config_path, 'r') as config_file:
        return json.load(config_file)
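# A minimal config.json sketch for reference: the keys below are the ones this script
# reads; the values are illustrative placeholders, not required defaults.
# {
#     "frame_paths": {
#         "closed_mouth": "frames/closed_mouth.png",
#         "open_mouth": "frames/open_mouth.png",
#         "closed_mouth_blinking": "frames/closed_mouth_blinking.png",
#         "open_mouth_blinking": "frames/open_mouth_blinking.png"
#     },
#     "background_color": [0, 255, 0, 255],
#     "frame_rate": 10,
#     "frame_duration_ms": 1000,
#     "dynamic_threshold": 1,
#     "decibel_threshold": -35,
#     "blink_duration": 0.2,
#     "initial_blink_time": 0,
#     "minimum_blinking_delay": 2,
#     "maximum_blinking_delay": 6,
#     "output_path": "output",
#     "codec": "libx264",
#     "audio_codec": "aac"
# }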
def process_audio_and_generate_video(config_path, audio_file):
    config = load_config(config_path)
    # Load the images
    closed_mouth_img = Image.open(config['frame_paths']['closed_mouth'])
    open_mouth_img = Image.open(config['frame_paths']['open_mouth'])
    closed_mouth_blinking_img = Image.open(config['frame_paths']['closed_mouth_blinking'])
    open_mouth_blinking_img = Image.open(config['frame_paths']['open_mouth_blinking'])
    # Create a background with the color from config
    background_color = tuple(config['background_color'])
    background = Image.new('RGBA', closed_mouth_img.size, background_color)
    # Composite the images with the background
    closed_mouth_img = Image.alpha_composite(background, closed_mouth_img)
    open_mouth_img = Image.alpha_composite(background, open_mouth_img)
    closed_mouth_blinking_img = Image.alpha_composite(background, closed_mouth_blinking_img)
    open_mouth_blinking_img = Image.alpha_composite(background, open_mouth_blinking_img)
    # Convert images to OpenCV format
    closed_mouth_cv = cv2.cvtColor(np.array(closed_mouth_img), cv2.COLOR_RGBA2RGB)
    open_mouth_cv = cv2.cvtColor(np.array(open_mouth_img), cv2.COLOR_RGBA2RGB)
    closed_mouth_blinking_cv = cv2.cvtColor(np.array(closed_mouth_blinking_img), cv2.COLOR_RGBA2RGB)
    open_mouth_blinking_cv = cv2.cvtColor(np.array(open_mouth_blinking_img), cv2.COLOR_RGBA2RGB)
    # Set parameters
    frame_rate = config['frame_rate']
    frame_duration_ms = config['frame_duration_ms'] // frame_rate
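    # For example, with frame_rate = 10 and frame_duration_ms = 1000 in the config
    # (illustrative numbers), each video frame covers 1000 // 10 = 100 ms of audio.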
    # Load the audio
    audio = AudioSegment.from_file(audio_file)
    # Apply compression
    compressed_audio = compress_dynamic_range(audio, threshold=-20.0, ratio=8.0, attack=1.0, release=10.0)
    # Normalize audio
    target_dBFS = -10.0
    change_in_dBFS = target_dBFS - compressed_audio.dBFS
    normalized_audio = compressed_audio.apply_gain(change_in_dBFS)
    # Split the audio into chunks of the same duration as the frames
    audio_chunks = make_chunks(normalized_audio, frame_duration_ms)
    # Function to calculate decibels of a chunk
    def calculate_decibels(chunk):
        return chunk.dBFS
    # Decide whether to use dynamic threshold or a fixed threshold
    if config["dynamic_threshold"] == 1:
        # Calculate average decibels
        average_dBFS = sum(chunk.dBFS for chunk in audio_chunks) / len(audio_chunks)
        decibel_threshold = average_dBFS + 4  # Set threshold above average
    else:
        decibel_threshold = config['decibel_threshold']
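    # For example, if the chunks average -30 dBFS, the dynamic threshold becomes -26 dBFS,
    # so the mouth opens only on chunks louder than -26 dBFS and stays closed otherwise.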
    # Blink logic
    blink_duration = config['blink_duration']
    last_blink_time = config['initial_blink_time']
    # Decide whether to blink
    def should_blink(t, last_blink_time):
        if t - last_blink_time > random.uniform(config['minimum_blinking_delay'], config['maximum_blinking_delay']):
            return True
        return False
    # Function to generate frames
    def make_frame(t):
        nonlocal last_blink_time
        frame_index = int(t * frame_rate)
        if should_blink(t, last_blink_time):
            last_blink_time = t
        if 0 <= (t - last_blink_time) <= blink_duration:
            if frame_index < len(audio_chunks):
                chunk = audio_chunks[frame_index]
                decibels = calculate_decibels(chunk)
                return open_mouth_blinking_cv if decibels > decibel_threshold else closed_mouth_blinking_cv
            else:
                return closed_mouth_blinking_cv
        if frame_index < len(audio_chunks):
            chunk = audio_chunks[frame_index]
            decibels = calculate_decibels(chunk)
            return open_mouth_cv if decibels > decibel_threshold else closed_mouth_cv
        else:
            return closed_mouth_cv
    # Create a video clip
    video_clip = VideoClip(make_frame, duration=len(audio_chunks) / frame_rate)
    # Load the audio
    audio_clip = AudioFileClip(audio_file)
    # Set the audio of the video to the loaded audio
    video_with_audio = video_clip.set_audio(audio_clip)
    # Write the final video with audio
    output_video_path = os.path.join(config['output_path'], f"{os.path.basename(audio_file).split('.')[0]}.mp4")
    video_with_audio.write_videofile(output_video_path, fps=frame_rate, codec=config['codec'], audio_codec=config["audio_codec"])
    return output_video_path
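# A direct call would look roughly like this (illustrative paths; the Gradio UI below
# is the usual entry point):
#     process_audio_and_generate_video("config.json", "audio/example.wav")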
html_content = """
<h3>How to Use</h3>
<p>Add 1-4 images to the <b>frames</b> folder and update the paths in <b>config.json</b> to point to the images you want.<br>
Put the audio files into the <b>audio</b> folder. One animation will be created for each audio file.</p>
<h3>Frame Images:</h3>
<table>
<tr>
<th>Closed Mouth</th>
<th>Closed Mouth Blinking</th>
<th>Open Mouth</th>
<th>Open Mouth Blinking</th>
</tr>
<tr>
<td><img src="https://github.com/user-attachments/assets/3ed0c597-df0e-4165-98d4-cf978e1338bb" alt="closed_mouth" width="150"/></td>
<td><img src="https://github.com/user-attachments/assets/1296c2a7-4304-4935-b398-4ee5e1fe8a10" alt="closed_mouth_blinking" width="150"/></td>
<td><img src="https://github.com/user-attachments/assets/4715a73a-1a27-4ac9-a20b-954dde0aac0b" alt="open_mouth" width="150"/></td>
<td><img src="https://github.com/user-attachments/assets/b7d04648-9158-4dd2-889c-27c67a64e0b2" alt="open_mouth_blinking" width="150"/></td>
</tr>
</table>
<a href="https://github.com/user-attachments/assets/dcf3728c-0d3b-455d-b17e-5e9819be069b">Download the assets here</a>
"""
# Gradio interface
def gradio_interface(config_file, audio_file):
    video_path = process_audio_and_generate_video(config_file, audio_file)
    return video_path
with gr.Blocks() as demo:
    gr.HTML(html_content)
    config_file_input = gr.File(label="Upload Config File (JSON)")
    audio_file_input = gr.Audio(label="Upload Audio File", type="filepath")
    output_video = gr.Video(label="Generated Video")
    generate_button = gr.Button("Generate Animation")
    generate_button.click(gradio_interface, [config_file_input, audio_file_input], output_video)
demo.launch()