"""Build the AirLetters montage video: a zooming grid of dataset clips with
title/subtitle overlay cards rendered via OpenCV and composited with MoviePy."""

import concurrent.futures
import glob
import os
import random
import time

import cv2
import numpy as np
from moviepy.editor import VideoFileClip, CompositeVideoClip, ColorClip, ImageClip
from tqdm import tqdm


def create_text_overlay(text, subtitle, width, height, start_time, duration):
    """Render a semi-transparent title/subtitle card as an RGBA image and wrap it
    in a clip that appears at `start_time` and stays on screen for `duration` seconds."""
    overlay = np.zeros((height, width, 4), dtype=np.uint8)

    # Semi-transparent black box covering the middle third of the frame.
    box_width = int(width * 0.75)
    box_x_start = (width - box_width) // 2
    cv2.rectangle(overlay, (box_x_start, height // 3),
                  (box_x_start + box_width, 2 * height // 3), (0, 0, 0, 180), -1)

    font = cv2.FONT_HERSHEY_DUPLEX
    if "AirLetters" in text:
        # Title card: larger text, flanked by envelope and wind emoji images.
        title_scale = 3.0
        subtitle_scale = 1.5
        envelope_emoji = cv2.imread("emoji/envelope.png", cv2.IMREAD_UNCHANGED)
        wind_emoji = cv2.imread("emoji/wind.png", cv2.IMREAD_UNCHANGED)
        target_height = int(title_scale * 30)
        envelope_aspect = envelope_emoji.shape[1] / envelope_emoji.shape[0]
        wind_aspect = wind_emoji.shape[1] / wind_emoji.shape[0]
        envelope_emoji = cv2.resize(envelope_emoji, (int(target_height * envelope_aspect), target_height))
        wind_emoji = cv2.resize(wind_emoji, (int(target_height * wind_aspect), target_height))
    else:
        title_scale = 2.0
        subtitle_scale = 1.0

    # MoviePy interprets the array as RGB, so these tuples are RGB(A) colors.
    title_color = (138, 223, 178, 255)
    subtitle_color = (255, 255, 255, 255)

    # Center the title within the box.
    title_size = cv2.getTextSize(text, font, title_scale, 2)[0]
    title_x = box_x_start + (box_width - title_size[0]) // 2
    title_y = height // 2

    if "AirLetters" in text:
        emoji_y = title_y - target_height + 5
        envelope_x = title_x - envelope_emoji.shape[1] - 20
        wind_x = title_x + title_size[0] + 20

        def overlay_image_with_alpha(background, foreground, x, y):
            """Alpha-blend `foreground` onto `background` at (x, y), clipping at the borders."""
            if x >= background.shape[1] or y >= background.shape[0]:
                return
            h, w = foreground.shape[:2]
            if len(foreground.shape) == 2:
                # Single-channel image: normalize it to [0, 1] and use it both as
                # alpha and as a gray color.
                alpha = foreground.astype(np.float64) / 255.0
                foreground = cv2.cvtColor(foreground, cv2.COLOR_GRAY2BGR)
            else:
                alpha = foreground[:, :, 3] / 255.0
                foreground = foreground[:, :, :3]
            y1, y2 = max(0, y), min(background.shape[0], y + h)
            x1, x2 = max(0, x), min(background.shape[1], x + w)
            alpha_slice = alpha[y1 - y:y2 - y, x1 - x:x2 - x]
            alpha_expanded = np.expand_dims(alpha_slice, axis=-1)
            background_slice = background[y1:y2, x1:x2, :3]
            foreground_slice = foreground[y1 - y:y2 - y, x1 - x:x2 - x]
            background[y1:y2, x1:x2, :3] = (background_slice * (1 - alpha_expanded)
                                            + foreground_slice * alpha_expanded)
            background[y1:y2, x1:x2, 3] = (background[y1:y2, x1:x2, 3] * (1 - alpha_slice)
                                           + 255 * alpha_slice)

        overlay_image_with_alpha(overlay, envelope_emoji, envelope_x, emoji_y)
        overlay_image_with_alpha(overlay, wind_emoji, wind_x, emoji_y)
    else:
        # Wrap long subtitles onto two lines.
        if len(subtitle) > 50:
            words = subtitle.split()
            mid = len(words) // 2
            subtitle = " ".join(words[:mid]) + "\n" + " ".join(words[mid:])

    cv2.putText(overlay, text, (title_x, title_y), font, title_scale, title_color, 2)

    if "\n" in subtitle:
        subtitle_y = title_y + 50
        for line in subtitle.split("\n"):
            subtitle_size = cv2.getTextSize(line, font, subtitle_scale, 2)[0]
            subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2
            cv2.putText(overlay, line, (subtitle_x, subtitle_y), font, subtitle_scale, subtitle_color, 2)
            subtitle_y += 50
    else:
        subtitle_size = cv2.getTextSize(subtitle, font, subtitle_scale, 2)[0]
        subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2
        cv2.putText(overlay, subtitle, (subtitle_x, title_y + 60), font, subtitle_scale, subtitle_color, 2)

    # Turn the RGBA image into a still clip whose alpha channel becomes the mask,
    # then fade the overlay's opacity in and out.
    overlay_clip = ImageClip(overlay, transparent=True)
    overlay_clip = overlay_clip.set_start(start_time).set_duration(duration)
    overlay_clip = overlay_clip.crossfadein(0.5).crossfadeout(0.5)
    return overlay_clip


def load_video(args):
    """Load one video, crop it to a square tile, loop it, and place it on the grid."""
    video_path, target_size, padding, idx, grid_width = args
    try:
        clip = VideoFileClip(video_path, audio=False)
        clip = clip.resize(height=target_size)
        # Center-crop to a square tile if the resized clip is wider than tall.
        if clip.w > target_size:
            clip = clip.crop(x1=(clip.w - target_size) // 2, x2=(clip.w + target_size) // 2)
        # Loop so every tile keeps playing for the whole montage.
        clip = clip.loop()

        # White border around each tile.
        bg = ColorClip(size=(target_size + padding * 2, target_size + padding * 2), color=(255, 255, 255))
        clip = clip.set_position((padding, padding))
        clip = CompositeVideoClip([bg, clip])

        # Position the tile in its grid slot.
        x = (idx % grid_width) * (target_size + padding * 2)
        y = (idx // grid_width) * (target_size + padding * 2)
        clip = clip.set_position((x, y))
        return clip
    except Exception as e:
        print(f"\nError processing {video_path}: {e}")
        return None


def create_montage(video_dir, output_path, width=1920, height=1080, fps=30):
    print("Starting video creation...")
    start_time = time.time()

    # Timeline: hold the full grid, zoom in, then hold the zoomed view.
    TOTAL_DURATION = 15
    FIRST_PHASE = 5
    TRANSITION = 5
    FINAL_PHASE = 5

    video_paths = glob.glob(os.path.join(video_dir, "*.mp4"))

    # Choose a grid of roughly 400 cells whose shape matches the 16:9 frame.
    base_grid_videos = 400
    aspect_ratio = 16 / 9
    grid_width = int(np.sqrt(base_grid_videos * aspect_ratio))
    grid_height = int(np.sqrt(base_grid_videos / aspect_ratio))
    padding = 1
    target_size = min(width // grid_width, height // grid_height) - padding * 2

    print(f"Creating grid of {grid_width}x{grid_height} videos")
    print(f"Video size: {target_size}x{target_size} pixels")

    needed_videos = grid_width * grid_height
    if len(video_paths) > needed_videos:
        video_paths = random.sample(video_paths, needed_videos)

    args_list = [(path, target_size, padding, idx, grid_width)
                 for idx, path in enumerate(video_paths)]

    # Load and prepare the tiles in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        results = list(tqdm(
            executor.map(load_video, args_list),
            total=len(args_list),
            desc="Loading videos",
        ))
    clips = [clip for clip in results if clip is not None]
    if not clips:
        raise ValueError("No videos were successfully loaded!")

    bg = ColorClip((width, height), color=(0, 0, 0))
    video_clips = [bg] + clips

    print("Creating video composition...")
    video_comp = CompositeVideoClip(video_clips, size=(width, height))
    w, h = video_comp.size

    def get_zoom_crop(t):
        """Crop size at time t: full frame, shrinking to one third during the transition."""
        if t < FIRST_PHASE:
            return (w, h)
        elif t < FIRST_PHASE + TRANSITION:
            progress = (t - FIRST_PHASE) / TRANSITION
            zoom_factor = 1 + (progress * 2)
        else:
            zoom_factor = 3
        return (int(w / zoom_factor), int(h / zoom_factor))

    def apply_zoom(gf, t):
        """Crop the frame center and scale it back up, producing a zoom-in effect."""
        frame = gf(t)
        cw, ch = get_zoom_crop(t)
        if cw >= w or ch >= h:
            return frame
        x = (w - cw) // 2
        y = (h - ch) // 2
        cropped = frame[y:y + ch, x:x + cw]
        return cv2.resize(cropped, (w, h), interpolation=cv2.INTER_LINEAR)

    video_comp = video_comp.fl(apply_zoom)
    video_comp = video_comp.set_duration(TOTAL_DURATION)

    text1 = create_text_overlay(
        "AirLetters",
        "\nAn Open Video Dataset of Characters Drawn in the Air",
        width, height, 0, FIRST_PHASE,
    )
    text2 = create_text_overlay(
        "Novel Video Understanding Benchmark",
        "for evaluating the ability to understand articulated motions, a task that "
        "requires very strong temporal capabilities and is very challenging for current models",
        width, height, FIRST_PHASE + TRANSITION, FINAL_PHASE,
    )

    final = CompositeVideoClip([video_comp, text1, text2])

    print("Writing final video...")
    final.write_videofile(
        output_path,
        fps=fps,
        codec='libx264',
        audio=False,
        threads=16,
        logger='bar',
    )

    print("Cleaning up...")
    final.close()
    for clip in clips:
        if clip is not None:
            clip.close()

    print(f"\nTotal processing time: {time.time() - start_time:.2f} seconds")
    print(f"Output saved to: {output_path}")


if __name__ == "__main__":
    create_montage(
        video_dir="airletters/videos",
        output_path="30fps.mp4",
        fps=30,
    )