import os
import random
import json
import numpy as np
from pydub import AudioSegment
from pydub.utils import make_chunks
from pydub.effects import compress_dynamic_range
from PIL import Image
import cv2
from moviepy.editor import VideoClip, AudioFileClip
import gradio as gr

# Load configuration
def load_config(config_path):
    with open(config_path, 'r') as config_file:
        return json.load(config_file)
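
# Illustrative config.json covering every key this script reads; the values
# below are example guesses, not shipped defaults. Time-related keys
# (blink_duration, initial_blink_time, min/max blinking delays) are in seconds,
# since moviepy's frame time t is in seconds.
# {
#     "frame_paths": {
#         "closed_mouth": "frames/closed_mouth.png",
#         "open_mouth": "frames/open_mouth.png",
#         "closed_mouth_blinking": "frames/closed_mouth_blinking.png",
#         "open_mouth_blinking": "frames/open_mouth_blinking.png"
#     },
#     "background_color": [0, 255, 0, 255],
#     "frame_rate": 30,
#     "frame_duration_ms": 1000,
#     "dynamic_threshold": 1,
#     "decibel_threshold": -30.0,
#     "blink_duration": 0.15,
#     "initial_blink_time": 0.0,
#     "minimum_blinking_delay": 2.0,
#     "maximum_blinking_delay": 6.0,
#     "output_path": "output",
#     "codec": "libx264",
#     "audio_codec": "aac"
# }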

def process_audio_and_generate_video(config_path, audio_file):
    config = load_config(config_path)

    # Load the four sprite frames; convert to RGBA so alpha_composite below
    # cannot fail on images saved without an alpha channel
    closed_mouth_img = Image.open(config['frame_paths']['closed_mouth']).convert('RGBA')
    open_mouth_img = Image.open(config['frame_paths']['open_mouth']).convert('RGBA')
    closed_mouth_blinking_img = Image.open(config['frame_paths']['closed_mouth_blinking']).convert('RGBA')
    open_mouth_blinking_img = Image.open(config['frame_paths']['open_mouth_blinking']).convert('RGBA')

    # Create a background with the color from config
    background_color = tuple(config['background_color'])
    background = Image.new('RGBA', closed_mouth_img.size, background_color)

    # Composite the images with the background
    closed_mouth_img = Image.alpha_composite(background, closed_mouth_img)
    open_mouth_img = Image.alpha_composite(background, open_mouth_img)
    closed_mouth_blinking_img = Image.alpha_composite(background, closed_mouth_blinking_img)
    open_mouth_blinking_img = Image.alpha_composite(background, open_mouth_blinking_img)

    # Convert the composited images to RGB numpy arrays (moviepy expects RGB
    # frames; only cv2.cvtColor is used here, so no BGR conversion is involved)
    closed_mouth_cv = cv2.cvtColor(np.array(closed_mouth_img), cv2.COLOR_RGBA2RGB)
    open_mouth_cv = cv2.cvtColor(np.array(open_mouth_img), cv2.COLOR_RGBA2RGB)
    closed_mouth_blinking_cv = cv2.cvtColor(np.array(closed_mouth_blinking_img), cv2.COLOR_RGBA2RGB)
    open_mouth_blinking_cv = cv2.cvtColor(np.array(open_mouth_blinking_img), cv2.COLOR_RGBA2RGB)

    # Timing: frame_duration_ms is the length of audio matching one video frame.
    # Use true division so chunk length and frame period stay in sync; integer
    # division would make the mouth movement drift out of sync over long audio.
    frame_rate = config['frame_rate']
    frame_duration_ms = config['frame_duration_ms'] / frame_rate
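    # Example: with frame_duration_ms = 1000 in the config and frame_rate = 30,
    # each chunk covers 1000 / 30 ≈ 33.33 ms of audio, i.e. one frame at 30 fps.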

    # Load the audio
    audio = AudioSegment.from_file(audio_file)

    # Apply compression
    compressed_audio = compress_dynamic_range(audio, threshold=-20.0, ratio=8.0, attack=1.0, release=10.0)
    
    # Normalize audio
    target_dBFS = -10.0
    change_in_dBFS = target_dBFS - compressed_audio.dBFS
    normalized_audio = compressed_audio.apply_gain(change_in_dBFS)
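    # Example: if the compressed audio measures -16.3 dBFS, change_in_dBFS is
    # -10.0 - (-16.3) = +6.3 dB, so apply_gain raises the level by 6.3 dB.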

    # Split the audio into chunks of the same duration as the frames
    audio_chunks = make_chunks(normalized_audio, frame_duration_ms)

    # Function to calculate decibels of a chunk
    def calculate_decibels(chunk):
        return chunk.dBFS

    # Decide whether to use a dynamic threshold or the fixed one from the config
    if config["dynamic_threshold"] == 1:
        # Average loudness over non-silent chunks only: fully silent chunks
        # report -inf dBFS and would drag the mean down to -inf
        voiced_levels = [chunk.dBFS for chunk in audio_chunks if chunk.dBFS != float('-inf')]
        if voiced_levels:
            average_dBFS = sum(voiced_levels) / len(voiced_levels)
            decibel_threshold = average_dBFS + 4  # open the mouth only above average
        else:
            decibel_threshold = config['decibel_threshold']  # silent input: fall back to the fixed value
    else:
        decibel_threshold = config['decibel_threshold']
    
    # Blink logic
    blink_duration = config['blink_duration']
    last_blink_time = config['initial_blink_time']

    # Decide whether to blink. The delay is re-drawn on every call, so blinking
    # is stochastic: once the minimum delay has elapsed each frame has some
    # chance of triggering, and past the maximum delay a blink is guaranteed.
    def should_blink(t, last_blink_time):
        return t - last_blink_time > random.uniform(
            config['minimum_blinking_delay'], config['maximum_blinking_delay'])

    # Generate the frame for time t: pick the blinking or non-blinking sprite
    # pair, then show the open mouth when the matching audio chunk is loud enough
    def make_frame(t):
        nonlocal last_blink_time
        frame_index = int(t * frame_rate)

        if should_blink(t, last_blink_time):
            last_blink_time = t

        blinking = 0 <= (t - last_blink_time) <= blink_duration
        open_frame = open_mouth_blinking_cv if blinking else open_mouth_cv
        closed_frame = closed_mouth_blinking_cv if blinking else closed_mouth_cv

        if frame_index < len(audio_chunks):
            decibels = calculate_decibels(audio_chunks[frame_index])
            return open_frame if decibels > decibel_threshold else closed_frame
        return closed_frame

    # Create a video clip
    video_clip = VideoClip(make_frame, duration=len(audio_chunks) / frame_rate)

    # Load the audio
    audio_clip = AudioFileClip(audio_file)

    # Set the audio of the video to the loaded audio
    video_with_audio = video_clip.set_audio(audio_clip)

    # Write the final video with audio, named after the input audio file
    os.makedirs(config['output_path'], exist_ok=True)
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    output_video_path = os.path.join(config['output_path'], f"{base_name}.mp4")
    video_with_audio.write_videofile(output_video_path, fps=frame_rate, codec=config['codec'], audio_codec=config["audio_codec"])

    return output_video_path
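
# A minimal batch sketch matching the usage notes below: it assumes a local
# "config.json" and an "audio" folder next to this script (neither is created
# here) and renders one animation per audio file, as the help text describes.
# The extension list is illustrative; pydub accepts any format ffmpeg can read.
#
#   for name in sorted(os.listdir("audio")):
#       if name.lower().endswith((".wav", ".mp3", ".ogg")):
#           process_audio_and_generate_video("config.json", os.path.join("audio", name))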


html_content = """
<h3>How to Use</h3>
<p>Add 1-4 images to the <b>frames</b> folder and update the paths in <b>config.json</b> to point at the images you want to use.<br>
Place your audio files in the <b>audio</b> folder; one animation is created per audio file.</p>

<h3>Frame Images:</h3>
<table>
  <tr>
    <th>Closed Mouth</th>
    <th>Closed Mouth Blinking</th>
    <th>Open Mouth</th>
    <th>Open Mouth Blinking</th>
  </tr>
  <tr>
    <td><img src="https://github.com/user-attachments/assets/3ed0c597-df0e-4165-98d4-cf978e1338bb" alt="closed_mouth" width="150"/></td>
    <td><img src="https://github.com/user-attachments/assets/1296c2a7-4304-4935-b398-4ee5e1fe8a10" alt="closed_mouth_blinking" width="150"/></td>
    <td><img src="https://github.com/user-attachments/assets/4715a73a-1a27-4ac9-a20b-954dde0aac0b" alt="open_mouth" width="150"/></td>
    <td><img src="https://github.com/user-attachments/assets/b7d04648-9158-4dd2-889c-27c67a64e0b2" alt="open_mouth_blinking" width="150"/></td>
  </tr>
</table>


<a href="https://github.com/user-attachments/assets/dcf3728c-0d3b-455d-b17e-5e9819be069b">Download the assets here</a>
"""

# Gradio interface
def gradio_interface(config_file, audio_file):
    video_path = process_audio_and_generate_video(config_file, audio_file)
    return video_path

with gr.Blocks() as demo:
    gr.HTML(html_content)
    config_file_input = gr.File(label="Upload Config File (JSON)")
    audio_file_input = gr.Audio(label="Upload Audio File", type="filepath")
    output_video = gr.Video(label="Generated Video")
    
    generate_button = gr.Button("Generate Animation")
    generate_button.click(gradio_interface, [config_file_input, audio_file_input], output_video)

if __name__ == "__main__":
    demo.launch()