import os
import math
import gradio as gr
import numpy as np
import torch
import safetensors.torch as sf
import db_examples
import datetime
from pathlib import Path
from io import BytesIO
from PIL import Image
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler, EulerAncestralDiscreteScheduler, DPMSolverMultistepScheduler
from diffusers.models.attention_processor import AttnProcessor2_0
from transformers import CLIPTextModel, CLIPTokenizer
from briarmbg import BriaRMBG
from enum import Enum
from torch.hub import download_url_to_file
import cv2
from typing import Optional

from Depth.depth_anything_v2.dpt import DepthAnythingV2
# from FLORENCE import spaces
import supervision as sv
from utils.sam import load_sam_image_model, run_sam_inference

try:
    import xformers
    import xformers.ops
    XFORMERS_AVAILABLE = True
    print("xformers is available - Using memory efficient attention")
except ImportError:
    XFORMERS_AVAILABLE = False
    print("xformers not available - Using default attention")

# Memory optimizations for RTX 2070
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # Cap the CUDA caching-allocator split size for the 8 GB RTX 2070
    # (PYTORCH_CUDA_ALLOC_CONF is the supported way to set max_split_size_mb).
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:512")
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# 'stablediffusionapi/realistic-vision-v51'
# 'runwayml/stable-diffusion-v1-5'
sd15_name = 'stablediffusionapi/realistic-vision-v51'
tokenizer = CLIPTokenizer.from_pretrained(sd15_name, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(sd15_name, subfolder="text_encoder")
vae = AutoencoderKL.from_pretrained(sd15_name, subfolder="vae")
# unet = None
unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")
rmbg = BriaRMBG.from_pretrained("briaai/RMBG-1.4")

model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384])
model.load_state_dict(torch.load('checkpoints/depth_anything_v2_vits.pth', map_location=device))
model = model.to(device)
model.eval()
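# The two initializers below rebuild the UNet for the two IC-Light variants:
# the FC (foreground-conditioned) model expects 8 input channels (4 noisy latent
# + 4 foreground latent), the FBC (foreground + background) model expects 12
# (4 + 4 foreground + 4 background). In both cases conv_in is widened with the
# new channels zero-initialized (so the pretrained 4-channel behaviour is
# preserved), the IC-Light checkpoint is merged as an additive offset on top of
# the base weights, and both pipelines are recreated around the patched UNet.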
def initialize_pipeline_fc():
    global unet, unet_original_forward, t2i_pipe, i2i_pipe, current_model

    # Clear previous models
    if unet is not None:
        del unet
        unet = None
    if t2i_pipe is not None:
        del t2i_pipe
        t2i_pipe = None
    if i2i_pipe is not None:
        del i2i_pipe
        i2i_pipe = None
    clear_memory()

    # Load UNet model
    unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")

    # Modify UNet to have 8 input channels
    with torch.no_grad():
        new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
        new_conv_in.weight.zero_()
        new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
        new_conv_in.bias = unet.conv_in.bias
        unet.conv_in = new_conv_in

    # Load model weights
    sd_offset = sf.load_file(model_path_fc)
    sd_origin = unet.state_dict()
    sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
    unet.load_state_dict(sd_merged, strict=True)
    del sd_offset, sd_origin, sd_merged

    # Set up forward function
    unet_original_forward = unet.forward
    unet.forward = hooked_unet_forward

    # Move UNet to device
    unet = unet.to(device=device, dtype=dtype)
    enable_efficient_attention()

    # Create pipelines
    t2i_pipe = StableDiffusionPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=dpmpp_2m_sde_karras_scheduler,
        safety_checker=None,
        feature_extractor=None
    )
    i2i_pipe = StableDiffusionImg2ImgPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=dpmpp_2m_sde_karras_scheduler,
        safety_checker=None,
        feature_extractor=None
    )

    current_model = 'fc'


def initialize_pipeline_fbc():
    global unet, unet_original_forward, t2i_pipe, i2i_pipe, current_model

    # Clear previous models
    if unet is not None:
        del unet
        unet = None
    if t2i_pipe is not None:
        del t2i_pipe
        t2i_pipe = None
    if i2i_pipe is not None:
        del i2i_pipe
        i2i_pipe = None
    clear_memory()

    # Load UNet model
    unet = UNet2DConditionModel.from_pretrained(sd15_name, subfolder="unet")

    # Modify UNet to have 12 input channels
    with torch.no_grad():
        new_conv_in = torch.nn.Conv2d(12, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
        new_conv_in.weight.zero_()
        new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
        new_conv_in.bias = unet.conv_in.bias
        unet.conv_in = new_conv_in

    # Load model weights
    sd_offset = sf.load_file(model_path_fbc)
    sd_origin = unet.state_dict()
    sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
    unet.load_state_dict(sd_merged, strict=True)
    del sd_offset, sd_origin, sd_merged

    # Set up forward function
    unet_original_forward = unet.forward
    unet.forward = hooked_unet_forward

    # Move UNet to device
    unet = unet.to(device=device, dtype=dtype)
    enable_efficient_attention()

    # Create pipelines
    t2i_pipe = StableDiffusionPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=dpmpp_2m_sde_karras_scheduler,
        safety_checker=None,
        feature_extractor=None
    )
    i2i_pipe = StableDiffusionImg2ImgPipeline(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=dpmpp_2m_sde_karras_scheduler,
        safety_checker=None,
        feature_extractor=None
    )

    current_model = 'fbc'


# Change UNet: expand the module-level UNet's conv_in to 8 input channels
# (extra channels zero-initialized) so the FC offset weights merged below line
# up, and keep a handle on the original forward for hooked_unet_forward.
with torch.no_grad():
    new_conv_in = torch.nn.Conv2d(8, unet.conv_in.out_channels, unet.conv_in.kernel_size, unet.conv_in.stride, unet.conv_in.padding)
    new_conv_in.weight.zero_()
    new_conv_in.weight[:, :4, :, :].copy_(unet.conv_in.weight)
    new_conv_in.bias = unet.conv_in.bias
    unet.conv_in = new_conv_in

unet_original_forward = unet.forward


def enable_efficient_attention():
    if XFORMERS_AVAILABLE:
        try:
            # RTX 2070 specific settings
            unet.set_use_memory_efficient_attention_xformers(True)
            vae.set_use_memory_efficient_attention_xformers(True)
            print("Enabled xformers memory efficient attention")
        except Exception as e:
            print(f"Xformers error: {e}")
            print("Falling back to sliced attention")
            # Use sliced attention on the UNet for RTX 2070
            # (diffusers exposes set_attention_slice; the VAE has no attention-slicing API)
            unet.set_attention_slice(4)
            unet.set_attn_processor(AttnProcessor2_0())
            vae.set_attn_processor(AttnProcessor2_0())
    else:
        # Fallback for when xformers is not available
        print("Using sliced attention")
        unet.set_attention_slice(4)
        unet.set_attn_processor(AttnProcessor2_0())
        vae.set_attn_processor(AttnProcessor2_0())


# Add memory clearing function
def clear_memory():
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


# Enable efficient attention
enable_efficient_attention()
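# hooked_unet_forward lets the standard diffusers pipelines drive the widened
# UNet: the VAE-encoded conditioning latents are smuggled in through
# cross_attention_kwargs['concat_conds'], repeated to match the batch size
# (classifier-free guidance doubles it), and concatenated to the noisy latents
# along the channel dimension before calling the original forward.
# e.g. for the FC model: sample (B, 4, H/8, W/8) + fg latent (B, 4, H/8, W/8)
# -> (B, 8, H/8, W/8); the FBC model adds a background latent for 12 channels.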
def hooked_unet_forward(sample, timestep, encoder_hidden_states, **kwargs):
    c_concat = kwargs['cross_attention_kwargs']['concat_conds'].to(sample)
    c_concat = torch.cat([c_concat] * (sample.shape[0] // c_concat.shape[0]), dim=0)
    new_sample = torch.cat([sample, c_concat], dim=1)
    # Pad new_sample up to the number of channels conv_in expects
    # (8 for the fc UNet, 12 for the fbc UNet)
    target_channels = unet.conv_in.in_channels
    if new_sample.shape[1] < target_channels:
        padding_channels = target_channels - new_sample.shape[1]
        padding = torch.zeros(
            new_sample.shape[0], padding_channels, new_sample.shape[2], new_sample.shape[3],
            device=new_sample.device, dtype=new_sample.dtype
        )
        new_sample = torch.cat([new_sample, padding], dim=1)
    kwargs['cross_attention_kwargs'] = {}
    return unet_original_forward(new_sample, timestep, encoder_hidden_states, **kwargs)


unet.forward = hooked_unet_forward

# Load
model_path_fc = './models/iclight_sd15_fc.safetensors'    # Foreground Conditioned model
model_path_fbc = './models/iclight_sd15_fbc.safetensors'  # Foreground and Background Conditioned model

# if not os.path.exists(model_path):
#     download_url_to_file(url='https://huggingface.co/lllyasviel/ic-light/resolve/main/iclight_sd15_fc.safetensors', dst=model_path)

# Merge the FC offsets into the base UNet weights (IC-Light checkpoints are additive offsets)
sd_offset = sf.load_file(model_path_fc)
sd_origin = unet.state_dict()
keys = sd_origin.keys()
sd_merged = {k: sd_origin[k] + sd_offset[k] for k in sd_origin.keys()}
unet.load_state_dict(sd_merged, strict=True)
del sd_offset, sd_origin, sd_merged, keys

# The FC weights are what got merged above; process_relight()/process_relight_bg()
# check this flag to decide whether the pipelines need to be re-initialized.
current_model = 'fc'

# Device
# device = torch.device('cuda')
# text_encoder = text_encoder.to(device=device, dtype=torch.float16)
# vae = vae.to(device=device, dtype=torch.bfloat16)
# unet = unet.to(device=device, dtype=torch.float16)
# rmbg = rmbg.to(device=device, dtype=torch.float32)

# Device and dtype setup
device = torch.device('cuda')
dtype = torch.float16  # RTX 2070 works well with float16

# Memory optimizations for RTX 2070
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    # Allocator split size is configured via PYTORCH_CUDA_ALLOC_CONF at the top of the file.

# Move models to device with consistent dtype
text_encoder = text_encoder.to(device=device, dtype=dtype)
vae = vae.to(device=device, dtype=dtype)  # Changed from bfloat16 to float16
unet = unet.to(device=device, dtype=dtype)
rmbg = rmbg.to(device=device, dtype=torch.float32)  # Keep this as float32

ddim_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)

euler_a_scheduler = EulerAncestralDiscreteScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    steps_offset=1
)

dpmpp_2m_sde_karras_scheduler = DPMSolverMultistepScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    algorithm_type="sde-dpmsolver++",
    use_karras_sigmas=True,
    steps_offset=1
)

# Pipelines
t2i_pipe = StableDiffusionPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=dpmpp_2m_sde_karras_scheduler,
    safety_checker=None,
    requires_safety_checker=False,
    feature_extractor=None,
    image_encoder=None
)

i2i_pipe = StableDiffusionImg2ImgPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=dpmpp_2m_sde_karras_scheduler,
    safety_checker=None,
    requires_safety_checker=False,
    feature_extractor=None,
    image_encoder=None
)
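# Prompt handling below follows the usual CLIP "77-token window" workaround:
# encode_prompt_inner splits an arbitrarily long prompt into 75-token chunks,
# wraps each chunk with BOS/EOS, pads it to 77 tokens and encodes every chunk;
# encode_prompt_pair then repeats the positive/negative chunk lists so both
# sides have the same number of chunks before being handed to the pipelines as
# prompt_embeds / negative_prompt_embeds.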
@torch.inference_mode()
def encode_prompt_inner(txt: str):
    max_length = tokenizer.model_max_length
    chunk_length = tokenizer.model_max_length - 2
    id_start = tokenizer.bos_token_id
    id_end = tokenizer.eos_token_id
    id_pad = id_end

    def pad(x, p, i):
        return x[:i] if len(x) >= i else x + [p] * (i - len(x))

    tokens = tokenizer(txt, truncation=False, add_special_tokens=False)["input_ids"]
    chunks = [[id_start] + tokens[i: i + chunk_length] + [id_end] for i in range(0, len(tokens), chunk_length)]
    chunks = [pad(ck, id_pad, max_length) for ck in chunks]

    token_ids = torch.tensor(chunks).to(device=device, dtype=torch.int64)
    conds = text_encoder(token_ids).last_hidden_state

    return conds


@torch.inference_mode()
def encode_prompt_pair(positive_prompt, negative_prompt):
    c = encode_prompt_inner(positive_prompt)
    uc = encode_prompt_inner(negative_prompt)

    c_len = float(len(c))
    uc_len = float(len(uc))
    max_count = max(c_len, uc_len)
    c_repeat = int(math.ceil(max_count / c_len))
    uc_repeat = int(math.ceil(max_count / uc_len))
    max_chunk = max(len(c), len(uc))

    c = torch.cat([c] * c_repeat, dim=0)[:max_chunk]
    uc = torch.cat([uc] * uc_repeat, dim=0)[:max_chunk]

    c = torch.cat([p[None, ...] for p in c], dim=1)
    uc = torch.cat([p[None, ...] for p in uc], dim=1)

    return c, uc


@torch.inference_mode()
def pytorch2numpy(imgs, quant=True):
    results = []
    for x in imgs:
        y = x.movedim(0, -1)

        if quant:
            y = y * 127.5 + 127.5
            y = y.detach().float().cpu().numpy().clip(0, 255).astype(np.uint8)
        else:
            y = y * 0.5 + 0.5
            y = y.detach().float().cpu().numpy().clip(0, 1).astype(np.float32)

        results.append(y)
    return results


@torch.inference_mode()
def numpy2pytorch(imgs):
    h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.0 - 1.0  # so that 127 must be strictly 0.0
    h = h.movedim(-1, 1)
    return h


def resize_and_center_crop(image, target_width, target_height):
    pil_image = Image.fromarray(image)
    original_width, original_height = pil_image.size
    scale_factor = max(target_width / original_width, target_height / original_height)
    resized_width = int(round(original_width * scale_factor))
    resized_height = int(round(original_height * scale_factor))
    resized_image = pil_image.resize((resized_width, resized_height), Image.LANCZOS)
    left = (resized_width - target_width) / 2
    top = (resized_height - target_height) / 2
    right = (resized_width + target_width) / 2
    bottom = (resized_height + target_height) / 2
    cropped_image = resized_image.crop((left, top, right, bottom))
    return np.array(cropped_image)


def resize_without_crop(image, target_width, target_height):
    pil_image = Image.fromarray(image)
    resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
    return np.array(resized_image)


@torch.inference_mode()
def run_rmbg(img, sigma=0.0):
    # Convert RGBA to RGB if needed
    if img.shape[-1] == 4:
        # Use white background for alpha composition
        alpha = img[..., 3:] / 255.0
        rgb = img[..., :3]
        white_bg = np.ones_like(rgb) * 255
        img = (rgb * alpha + white_bg * (1 - alpha)).astype(np.uint8)

    H, W, C = img.shape
    assert C == 3
    k = (256.0 / float(H * W)) ** 0.5
    feed = resize_without_crop(img, int(64 * round(W * k)), int(64 * round(H * k)))
    feed = numpy2pytorch([feed]).to(device=device, dtype=torch.float32)
    alpha = rmbg(feed)[0][0]
    alpha = torch.nn.functional.interpolate(alpha, size=(H, W), mode="bilinear")
    alpha = alpha.movedim(1, -1)[0]
    alpha = alpha.detach().float().cpu().numpy().clip(0, 1)

    # Create RGBA image
    rgba = np.dstack((img, alpha * 255)).astype(np.uint8)
    result = 127 + (img.astype(np.float32) - 127 + sigma) * alpha
    return result.clip(0, 255).astype(np.uint8), rgba
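# process() is the foreground-conditioned relighting path. It runs in two
# stages: a low-resolution pass (text-to-image, or img2img from the chosen
# background / lighting gradient), then an upscale by `highres_scale` followed
# by an img2img refinement pass. Both stages condition the UNet on the
# VAE-encoded foreground via cross_attention_kwargs={'concat_conds': ...},
# which hooked_unet_forward turns into extra input channels.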
@torch.inference_mode()
def process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    clear_memory()

    # Get input dimensions
    input_height, input_width = input_fg.shape[:2]

    bg_source = BGSource(bg_source)

    # This path has no uploaded background; input_bg stays None for BGSource.UPLOAD,
    # which selects the pure text-to-image branch below.
    input_bg = None

    if bg_source == BGSource.UPLOAD:
        pass
    elif bg_source == BGSource.UPLOAD_FLIP:
        input_bg = np.fliplr(input_bg)
    elif bg_source == BGSource.GREY:
        input_bg = np.zeros(shape=(input_height, input_width, 3), dtype=np.uint8) + 64
    elif bg_source == BGSource.LEFT:
        gradient = np.linspace(255, 0, input_width)
        image = np.tile(gradient, (input_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.RIGHT:
        gradient = np.linspace(0, 255, input_width)
        image = np.tile(gradient, (input_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.TOP:
        gradient = np.linspace(255, 0, input_height)[:, None]
        image = np.tile(gradient, (1, input_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.BOTTOM:
        gradient = np.linspace(0, 255, input_height)[:, None]
        image = np.tile(gradient, (1, input_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    else:
        raise ValueError('Wrong initial latent!')

    rng = torch.Generator(device=device).manual_seed(int(seed))

    # Use input dimensions directly
    fg = resize_without_crop(input_fg, input_width, input_height)

    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    if input_bg is None:
        latents = t2i_pipe(
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            num_inference_steps=steps,
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor
    else:
        bg = resize_without_crop(input_bg, input_width, input_height)
        bg_latent = numpy2pytorch([bg]).to(device=vae.device, dtype=vae.dtype)
        bg_latent = vae.encode(bg_latent).latent_dist.mode() * vae.config.scaling_factor
        latents = i2i_pipe(
            image=bg_latent,
            strength=lowres_denoise,
            prompt_embeds=conds,
            negative_prompt_embeds=unconds,
            width=input_width,
            height=input_height,
            num_inference_steps=int(round(steps / lowres_denoise)),
            num_images_per_prompt=num_samples,
            generator=rng,
            output_type='latent',
            guidance_scale=cfg,
            cross_attention_kwargs={'concat_conds': concat_conds},
        ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(input_width * highres_scale / 64.0) * 64),
        target_height=int(round(input_height * highres_scale / 64.0) * 64))
        for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    highres_height, highres_width = latents.shape[2] * 8, latents.shape[3] * 8

    fg = resize_without_crop(input_fg, highres_width, highres_height)
    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor

    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=highres_width,
        height=highres_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)

    # Resize back to input dimensions
    pixels = [resize_without_crop(p, input_width, input_height) for p in pixels]
    pixels = np.stack(pixels)

    return pixels
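# process_bg() is the foreground + background conditioned path (fbc model):
# both the foreground and the background are VAE-encoded and concatenated into
# an 8-channel concat_conds tensor, giving the UNet its full 12-channel input.
# Gradient "light" backgrounds here use a softer 32-224 ramp instead of 0-255.
# It returns float pixels in [0, 1] (quant=False) plus the fg/bg crops it used.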
@torch.inference_mode()
def process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    clear_memory()
    bg_source = BGSource(bg_source)

    if bg_source == BGSource.UPLOAD:
        pass
    elif bg_source == BGSource.UPLOAD_FLIP:
        input_bg = np.fliplr(input_bg)
    elif bg_source == BGSource.GREY:
        input_bg = np.zeros(shape=(image_height, image_width, 3), dtype=np.uint8) + 64
    elif bg_source == BGSource.LEFT:
        gradient = np.linspace(224, 32, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.RIGHT:
        gradient = np.linspace(32, 224, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.TOP:
        gradient = np.linspace(224, 32, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.BOTTOM:
        gradient = np.linspace(32, 224, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    else:
        raise ValueError('Wrong background source!')

    rng = torch.Generator(device=device).manual_seed(seed)

    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    conds, unconds = encode_prompt_pair(positive_prompt=prompt + ', ' + a_prompt, negative_prompt=n_prompt)

    latents = t2i_pipe(
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=steps,
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels)
    pixels = [resize_without_crop(
        image=p,
        target_width=int(round(image_width * highres_scale / 64.0) * 64),
        target_height=int(round(image_height * highres_scale / 64.0) * 64))
        for p in pixels]

    pixels = numpy2pytorch(pixels).to(device=vae.device, dtype=vae.dtype)
    latents = vae.encode(pixels).latent_dist.mode() * vae.config.scaling_factor
    latents = latents.to(device=unet.device, dtype=unet.dtype)

    image_height, image_width = latents.shape[2] * 8, latents.shape[3] * 8
    fg = resize_and_center_crop(input_fg, image_width, image_height)
    bg = resize_and_center_crop(input_bg, image_width, image_height)
    concat_conds = numpy2pytorch([fg, bg]).to(device=vae.device, dtype=vae.dtype)
    concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
    concat_conds = torch.cat([c[None, ...] for c in concat_conds], dim=1)

    latents = i2i_pipe(
        image=latents,
        strength=highres_denoise,
        prompt_embeds=conds,
        negative_prompt_embeds=unconds,
        width=image_width,
        height=image_height,
        num_inference_steps=int(round(steps / highres_denoise)),
        num_images_per_prompt=num_samples,
        generator=rng,
        output_type='latent',
        guidance_scale=cfg,
        cross_attention_kwargs={'concat_conds': concat_conds},
    ).images.to(vae.dtype) / vae.config.scaling_factor

    pixels = vae.decode(latents).sample
    pixels = pytorch2numpy(pixels, quant=False)

    clear_memory()
    return pixels, [fg, bg]
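# The two wrappers below are what the UI buttons call. They lazily swap the
# loaded IC-Light variant (tracked in `current_model`) before running: the
# Text tab needs the fc UNet, the Background tab the fbc UNet. Both first
# matte the subject with BriaRMBG so the model sees a cleanly extracted
# foreground on a neutral grey.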
@torch.inference_mode()
def process_relight(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source):
    global current_model
    if current_model != 'fc':
        initialize_pipeline_fc()
    input_fg, matting = run_rmbg(input_fg)
    results = process(input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source)
    return input_fg, results


@torch.inference_mode()
def process_relight_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source):
    global current_model
    if current_model != 'fbc':
        initialize_pipeline_fbc()

    bg_source = BGSource(bg_source)

    # Convert numerical inputs to appropriate types
    image_width = int(image_width)
    image_height = int(image_height)
    num_samples = int(num_samples)
    seed = int(seed)
    steps = int(steps)
    cfg = float(cfg)
    highres_scale = float(highres_scale)
    highres_denoise = float(highres_denoise)

    if bg_source == BGSource.UPLOAD:
        pass
    elif bg_source == BGSource.UPLOAD_FLIP:
        input_bg = np.fliplr(input_bg)
    elif bg_source == BGSource.GREY:
        input_bg = np.zeros(shape=(image_height, image_width, 3), dtype=np.uint8) + 64
    elif bg_source == BGSource.LEFT:
        gradient = np.linspace(224, 32, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.RIGHT:
        gradient = np.linspace(32, 224, image_width)
        image = np.tile(gradient, (image_height, 1))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.TOP:
        gradient = np.linspace(224, 32, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    elif bg_source == BGSource.BOTTOM:
        gradient = np.linspace(32, 224, image_height)[:, None]
        image = np.tile(gradient, (1, image_width))
        input_bg = np.stack((image,) * 3, axis=-1).astype(np.uint8)
    else:
        raise ValueError('Wrong background source!')

    input_fg, matting = run_rmbg(input_fg)
    results, extra_images = process_bg(input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source)
    results = [(x * 255.0).clip(0, 255).astype(np.uint8) for x in results]
    final_results = results + extra_images

    # Save the generated images
    save_images(results, prefix="relight")

    return results


quick_prompts = [
    'sunshine from window',
    'neon light, city',
    'sunset over sea',
    'golden time',
    'sci-fi RGB glowing, cyberpunk',
    'natural lighting',
    'warm atmosphere, at home, bedroom',
    'magic lit',
    'evil, gothic, Yharnam',
    'light and shadow',
    'shadow from window',
    'soft studio lighting',
    'home atmosphere, cozy bedroom illumination',
    'neon, Wong Kar-wai, warm'
]
quick_prompts = [[x] for x in quick_prompts]

quick_subjects = [
    'modern sofa, high quality leather',
    'elegant dining table, polished wood',
    'luxurious bed, premium mattress',
    'minimalist office desk, clean design',
    'vintage wooden cabinet, antique finish',
]
quick_subjects = [[x] for x in quick_subjects]
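# BGSource values double as the labels shown in the Gradio radio buttons, so
# the processing functions can map the selected string straight back to the
# enum with BGSource(bg_source). "Ambient" (GREY) seeds the initial latent with
# a flat mid-grey, while the Left/Right/Top/Bottom options seed it with a
# brightness gradient that biases where the light appears to come from.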
class BGSource(Enum):
    UPLOAD = "Use Background Image"
    UPLOAD_FLIP = "Use Flipped Background Image"
    LEFT = "Left Light"
    RIGHT = "Right Light"
    TOP = "Top Light"
    BOTTOM = "Bottom Light"
    GREY = "Ambient"


# Add save function
def save_images(images, prefix="relight"):
    # Create output directory if it doesn't exist
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)

    # Create timestamp for unique filenames
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    saved_paths = []
    for i, img in enumerate(images):
        if isinstance(img, np.ndarray):
            # Convert to PIL Image if numpy array
            img = Image.fromarray(img)

        # Create filename with timestamp
        filename = f"{prefix}_{timestamp}_{i+1}.png"
        filepath = output_dir / filename

        # Save image and record its path
        img.save(filepath)
        saved_paths.append(filepath)

    # print(f"Saved {len(saved_paths)} images to {output_dir}")
    return saved_paths


class MaskMover:
    def __init__(self):
        self.extracted_fg = None
        self.original_fg = None  # Store original foreground

    def set_extracted_fg(self, fg_image):
        """Store the extracted foreground with alpha channel"""
        if isinstance(fg_image, np.ndarray):
            self.extracted_fg = fg_image.copy()
            self.original_fg = fg_image.copy()
        else:
            self.extracted_fg = np.array(fg_image)
            self.original_fg = np.array(fg_image)
        return self.extracted_fg

    def create_composite(self, background, x_pos, y_pos, scale=1.0):
        """Create composite with foreground at specified position"""
        if self.original_fg is None or background is None:
            return background

        # Convert inputs to PIL Images
        if isinstance(background, np.ndarray):
            bg = Image.fromarray(background).convert('RGBA')
        else:
            bg = background.convert('RGBA')

        if isinstance(self.original_fg, np.ndarray):
            fg = Image.fromarray(self.original_fg).convert('RGBA')
        else:
            fg = self.original_fg.convert('RGBA')

        # Scale the foreground size
        new_width = int(fg.width * scale)
        new_height = int(fg.height * scale)
        fg = fg.resize((new_width, new_height), Image.LANCZOS)

        # Center the scaled foreground at the position
        x = int(x_pos - new_width / 2)
        y = int(y_pos - new_height / 2)

        # Create composite
        result = bg.copy()
        result.paste(fg, (x, y), fg)  # Use fg as the mask (requires fg to be in 'RGBA' mode)

        return np.array(result.convert('RGB'))  # Convert back to 'RGB' if needed
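# get_depth() runs the Depth-Anything-V2 (vits) model loaded at the top of the
# file and returns an INFERNO-colormapped visualization of the normalized depth
# map. It is only shown in the UI as a preview; the depth map is not fed back
# into the relighting pipelines.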
def get_depth(image):
    if image is None:
        return None
    # Convert from PIL/gradio format to cv2
    raw_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # Get depth map
    depth = model.infer_image(raw_img)  # HxW raw depth map
    # Normalize depth for visualization
    depth = ((depth - depth.min()) / (depth.max() - depth.min()) * 255).astype(np.uint8)
    # Convert to RGB for display
    depth_colored = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
    depth_colored = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
    return Image.fromarray(depth_colored)


def compress_image(image):
    # Convert Gradio image (numpy array) to PIL Image
    img = Image.fromarray(image)

    # Resize image if dimensions are too large
    max_size = 1024  # Maximum dimension size
    if img.width > max_size or img.height > max_size:
        ratio = min(max_size / img.width, max_size / img.height)
        new_size = (int(img.width * ratio), int(img.height * ratio))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    quality = 95  # Start with high quality
    img.save("compressed_image.jpg", "JPEG", quality=quality)  # Initial save

    # Check file size and adjust quality if necessary
    while os.path.getsize("compressed_image.jpg") > 100 * 1024:  # 100KB limit
        quality -= 5  # Decrease quality
        img.save("compressed_image.jpg", "JPEG", quality=quality)
        if quality < 20:  # Prevent quality from going too low
            break

    # Convert back to numpy array for Gradio
    compressed_img = np.array(Image.open("compressed_image.jpg"))
    return compressed_img


block = gr.Blocks().queue()
with block:
    with gr.Tab("Text"):
        with gr.Row():
            gr.Markdown("## Product Placement from Text")
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    input_fg = gr.Image(type="numpy", label="Image", height=480)
                    output_bg = gr.Image(type="numpy", label="Preprocessed Foreground", height=480)
                with gr.Group():
                    prompt = gr.Textbox(label="Prompt")
                    bg_source = gr.Radio(choices=[e.value for e in BGSource],
                                         value=BGSource.GREY.value,
                                         label="Lighting Preference (Initial Latent)", type='value')
                    example_quick_subjects = gr.Dataset(samples=quick_subjects, label='Subject Quick List', samples_per_page=1000, components=[prompt])
                    example_quick_prompts = gr.Dataset(samples=quick_prompts, label='Lighting Quick List', samples_per_page=1000, components=[prompt])
                relight_button = gr.Button(value="Relight")

                with gr.Group():
                    with gr.Row():
                        num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
                        seed = gr.Number(label="Seed", value=12345, precision=0)
                    with gr.Row():
                        image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
                        image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)

                with gr.Accordion("Advanced options", open=False):
                    steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=15, step=1)
                    cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=2, step=0.01)
                    lowres_denoise = gr.Slider(label="Lowres Denoise (for initial latent)", minimum=0.1, maximum=1.0, value=0.9, step=0.01)
                    highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=3.0, value=1.5, step=0.01)
                    highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=1.0, value=0.5, step=0.01)
                    a_prompt = gr.Textbox(label="Added Prompt", value='best quality')
                    n_prompt = gr.Textbox(label="Negative Prompt", value='lowres, bad anatomy, bad hands, cropped, worst quality')
            with gr.Column():
                result_gallery = gr.Gallery(height=832, object_fit='contain', label='Outputs')
        with gr.Row():
            dummy_image_for_outputs = gr.Image(visible=False, label='Result')

        # gr.Examples(
        #     fn=lambda *args: ([args[-1]], None),
        #     examples=db_examples.foreground_conditioned_examples,
        #     inputs=[
        #         input_fg, prompt, bg_source, image_width, image_height, seed, dummy_image_for_outputs
        #     ],
        #     outputs=[result_gallery, output_bg],
        #     run_on_click=True, examples_per_page=1024
        # )

        ips = [input_fg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, lowres_denoise, bg_source]
        relight_button.click(fn=process_relight, inputs=ips, outputs=[output_bg, result_gallery])
        example_quick_prompts.click(lambda x, y: ', '.join(y.split(', ')[:2] + [x[0]]), inputs=[example_quick_prompts, prompt], outputs=prompt, show_progress=False, queue=False)
        example_quick_subjects.click(lambda x: x[0], inputs=example_quick_subjects, outputs=prompt, show_progress=False, queue=False)
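    # The "Background" tab drives the fbc model through a three-step flow:
    # 1) extract the foreground with RMBG, 2) position/scale it over a chosen
    # background (the sliders rebuild the composite preview), 3) relight only a
    # padded crop around the placed subject and paste the result back into the
    # full-resolution background.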
    with gr.Tab("Background"):
        mask_mover = MaskMover()

        with gr.Row():
            gr.Markdown("## IC-Light (Relighting with Foreground and Background Condition)")
            gr.Markdown("💾 Generated images are automatically saved to 'outputs' folder")

        with gr.Row():
            with gr.Column():
                # Step 1: Input and Extract
                with gr.Row():
                    with gr.Group():
                        gr.Markdown("### Step 1: Extract Foreground")
                        input_image = gr.Image(type="numpy", label="Input Image", height=480)
                        # find_objects_button = gr.Button(value="Find Objects")
                        extract_button = gr.Button(value="Remove Background")
                        extracted_fg = gr.Image(type="numpy", label="Extracted Foreground", height=480)

                with gr.Row():
                    # Step 2: Background and Position
                    with gr.Group():
                        gr.Markdown("### Step 2: Position on Background")
                        input_bg = gr.Image(type="numpy", label="Background Image", height=480)

                        with gr.Row():
                            x_slider = gr.Slider(
                                minimum=0,
                                maximum=1000,
                                label="X Position",
                                value=500,
                                visible=False
                            )
                            y_slider = gr.Slider(
                                minimum=0,
                                maximum=1000,
                                label="Y Position",
                                value=500,
                                visible=False
                            )
                        fg_scale_slider = gr.Slider(
                            label="Foreground Scale",
                            minimum=0.01,
                            maximum=3.0,
                            value=1.0,
                            step=0.01
                        )

                        editor = gr.ImageEditor(
                            type="numpy",
                            label="Position Foreground",
                            height=480,
                            visible=False
                        )
                        get_depth_button = gr.Button(value="Get Depth")
                        depth_image = gr.Image(type="numpy", label="Depth Image", height=480)

                # Step 3: Relighting Options
                with gr.Group():
                    gr.Markdown("### Step 3: Relighting Settings")
                    prompt = gr.Textbox(label="Prompt")
                    bg_source = gr.Radio(
                        choices=[e.value for e in BGSource],
                        value=BGSource.UPLOAD.value,
                        label="Background Source",
                        type='value'
                    )

                    example_prompts = gr.Dataset(
                        samples=quick_prompts,
                        label='Prompt Quick List',
                        components=[prompt]
                    )
                    # bg_gallery = gr.Gallery(
                    #     height=450,
                    #     label='Background Quick List',
                    #     value=db_examples.bg_samples,
                    #     columns=5,
                    #     allow_preview=False
                    # )
                    relight_button_bg = gr.Button(value="Relight")

                # Additional settings
                with gr.Group():
                    with gr.Row():
                        num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1)
                        seed = gr.Number(label="Seed", value=12345, precision=0)
                    with gr.Row():
                        image_width = gr.Slider(label="Image Width", minimum=256, maximum=1024, value=512, step=64)
                        image_height = gr.Slider(label="Image Height", minimum=256, maximum=1024, value=640, step=64)

                with gr.Accordion("Advanced options", open=False):
                    steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1)
                    cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=7.0, step=0.01)
                    highres_scale = gr.Slider(label="Highres Scale", minimum=1.0, maximum=2.0, value=1.2, step=0.01)
                    highres_denoise = gr.Slider(label="Highres Denoise", minimum=0.1, maximum=0.9, value=0.5, step=0.01)
                    a_prompt = gr.Textbox(label="Added Prompt", value='best quality')
                    n_prompt = gr.Textbox(
                        label="Negative Prompt",
                        value='lowres, bad anatomy, bad hands, cropped, worst quality'
                    )

            with gr.Column():
                result_gallery = gr.Image(height=832, label='Outputs')

        def extract_foreground(image):
            if image is None:
                return None, gr.update(visible=True), gr.update(visible=True)
            result, rgba = run_rmbg(image)
            mask_mover.set_extracted_fg(rgba)
            return result, gr.update(visible=True), gr.update(visible=True)

        original_bg = None

        extract_button.click(
            fn=extract_foreground,
            inputs=[input_image],
            outputs=[extracted_fg, x_slider, y_slider]
        )

        # find_objects_button.click(
        #     fn=find_objects,
        #     inputs=[input_image],
        #     outputs=[extracted_fg]
        # )

        get_depth_button.click(
            fn=get_depth,
            inputs=[input_bg],
            outputs=[depth_image]
        )

        # def update_position(background, x_pos, y_pos, scale):
        #     """Update composite when position changes"""
        #     global original_bg
        #     if background is None:
        #         return None
        #     if original_bg is None:
        #         original_bg = background.copy()
        #     # Convert string values to float
        #     x_pos = float(x_pos)
        #     y_pos = float(y_pos)
        #     scale = float(scale)
        #     return mask_mover.create_composite(original_bg, x_pos, y_pos, scale)
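        # BackgroundManager keeps the background that the slider callbacks
        # composite the extracted foreground onto; each x/y/scale change rebuilds
        # the preview via mask_mover.create_composite and writes it back into the
        # input_bg component, which is what process_relight_with_position later
        # crops and relights.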
        class BackgroundManager:
            def __init__(self):
                self.original_bg = None

            def update_position(self, background, x_pos, y_pos, scale):
                """Update composite when position changes"""
                if background is None:
                    return None
                self.original_bg = background.copy()
                # Convert string values to float
                x_pos = float(x_pos)
                y_pos = float(y_pos)
                scale = float(scale)
                return mask_mover.create_composite(self.original_bg, x_pos, y_pos, scale)

        # Create an instance of BackgroundManager
        bg_manager = BackgroundManager()

        x_slider.change(
            fn=lambda bg, x, y, scale: bg_manager.update_position(bg, x, y, scale),
            inputs=[input_bg, x_slider, y_slider, fg_scale_slider],
            outputs=[input_bg]
        )
        y_slider.change(
            fn=lambda bg, x, y, scale: bg_manager.update_position(bg, x, y, scale),
            inputs=[input_bg, x_slider, y_slider, fg_scale_slider],
            outputs=[input_bg]
        )
        fg_scale_slider.change(
            fn=lambda bg, x, y, scale: bg_manager.update_position(bg, x, y, scale),
            inputs=[input_bg, x_slider, y_slider, fg_scale_slider],
            outputs=[input_bg]
        )

        # Update inputs list to include fg_scale_slider
        def process_relight_with_position(*args):
            if mask_mover.extracted_fg is None:
                gr.Warning("Please extract foreground first")
                return None

            background = args[1]     # Get background image
            x_pos = float(args[-3])  # x_slider value
            y_pos = float(args[-2])  # y_slider value
            scale = float(args[-1])  # fg_scale_slider value

            # Get original foreground size after scaling
            fg = Image.fromarray(mask_mover.original_fg)
            new_width = int(fg.width * scale)
            new_height = int(fg.height * scale)

            # Calculate crop region around foreground position
            crop_x = int(x_pos - new_width / 2)
            crop_y = int(y_pos - new_height / 2)
            crop_width = new_width
            crop_height = new_height

            # Add padding for context (20% extra on each side)
            padding = 0.2
            crop_x = int(crop_x - crop_width * padding)
            crop_y = int(crop_y - crop_height * padding)
            crop_width = int(crop_width * (1 + 2 * padding))
            crop_height = int(crop_height * (1 + 2 * padding))

            # Ensure crop dimensions are multiples of 8
            crop_width = ((crop_width + 7) // 8) * 8
            crop_height = ((crop_height + 7) // 8) * 8

            # Ensure crop region is within image bounds
            bg_height, bg_width = background.shape[:2]
            crop_x = max(0, min(crop_x, bg_width - crop_width))
            crop_y = max(0, min(crop_y, bg_height - crop_height))

            # Get actual crop dimensions after boundary check
            crop_width = min(crop_width, bg_width - crop_x)
            crop_height = min(crop_height, bg_height - crop_y)

            # Ensure dimensions are multiples of 8 again
            crop_width = (crop_width // 8) * 8
            crop_height = (crop_height // 8) * 8

            # Crop region from background
            crop_region = background[crop_y:crop_y + crop_height, crop_x:crop_x + crop_width]

            # Create composite in cropped region
            fg_local_x = int(new_width / 2 + crop_width * padding)
            fg_local_y = int(new_height / 2 + crop_height * padding)
            cropped_composite = mask_mover.create_composite(crop_region, fg_local_x, fg_local_y, scale)

            # Process the cropped region
            crop_args = list(args)
            crop_args[0] = cropped_composite
            crop_args[1] = crop_region
            crop_args[3] = crop_width
            crop_args[4] = crop_height
            crop_args = crop_args[:-3]  # Remove position and scale arguments

            # Get relit result
            relit_crop = process_relight_bg(*crop_args)[0]

            # Resize relit result to match crop dimensions if needed
            if relit_crop.shape[:2] != (crop_height, crop_width):
                relit_crop = resize_without_crop(relit_crop, crop_width, crop_height)

            # Place relit crop back into original background
            result = background.copy()
            result[crop_y:crop_y + crop_height, crop_x:crop_x + crop_width] = relit_crop
            return result
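        # process_relight_with_position peels the last three entries off its
        # argument list as the x/y position and scale sliders, so ips_bg below
        # must end with x_slider, y_slider, fg_scale_slider in that order; the
        # remaining entries line up with process_relight_bg's signature (with
        # width/height at indices 3 and 4 being overwritten by the crop size).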
        ips_bg = [input_fg, input_bg, prompt, image_width, image_height, num_samples, seed, steps, a_prompt, n_prompt, cfg, highres_scale, highres_denoise, bg_source,
                  x_slider, y_slider, fg_scale_slider]

        # Update button click events with new inputs list
        relight_button_bg.click(
            fn=process_relight_with_position,
            inputs=ips_bg,
            outputs=[result_gallery]
        )

        example_prompts.click(
            fn=lambda x: x[0],
            inputs=example_prompts,
            outputs=prompt,
            show_progress=False,
            queue=False
        )

block.launch(server_name='0.0.0.0', share=True)
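# Local assets this script expects (a sketch of the setup, not an installer):
#   ./models/iclight_sd15_fc.safetensors     - IC-Light foreground-conditioned offsets
#   ./models/iclight_sd15_fbc.safetensors    - IC-Light foreground+background offsets
#   ./checkpoints/depth_anything_v2_vits.pth - Depth-Anything-V2 (vits) weights
#   briarmbg.py, utils/sam.py, db_examples.py, Depth/ - local helper modules imported above
# launch() binds to 0.0.0.0 and requests a public Gradio share link, so the UI is
# reachable from other machines; drop share=True for purely local use.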