# DiffusionGenerator / diffusion.py
import torch
import matplotlib.pyplot as plt
from torchvision.utils import save_image
from torchvision import transforms
from torch.utils.data import DataLoader
import numpy as np, os
from torch import nn
import math
import torch.nn.functional as F
from torch.optim import Adam
from typing import Optional
import random
def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
def get_beta_schedule(beta_schedule, beta_start, beta_end, num_diffusion_timesteps):
def sigmoid(x):
return 1 / (np.exp(-x) + 1)
if beta_schedule == "quad":
betas = (
np.linspace(
beta_start ** 0.5,
beta_end ** 0.5,
num_diffusion_timesteps,
dtype=np.float64,
)
** 2
)
elif beta_schedule == "linear":
betas = np.linspace(
beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
)
elif beta_schedule == "const":
betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
betas = 1.0 / np.linspace(
num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
)
elif beta_schedule == "sigmoid":
betas = np.linspace(-6, 6, num_diffusion_timesteps)
betas = sigmoid(betas) * (beta_end - beta_start) + beta_start
else:
raise NotImplementedError(beta_schedule)
assert betas.shape == (num_diffusion_timesteps,)
betas = torch.from_numpy(betas).float()
return betas
def get_index_from_list(vals, t, x_shape):
"""
Returns a specific index t of a passed list of values vals
while considering the batch dimension.
"""
batch_size = t.shape[0]
out = vals.gather(-1, t.cpu())
return out.reshape(batch_size, *((1,) * (len(x_shape) - 1))).to(t.device)
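# Example (illustrative): with vals of shape [T] and t of shape [batch_size],
# get_index_from_list returns a tensor of shape [batch_size, 1, 1, 1] when x_shape
# is an image batch [batch_size, C, H, W], so it broadcasts over the whole image.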
def forward_diffusion_sample(x, t, device="cpu"):
"""
Takes an image and a timestep as input and
returns the noisy version of it
"""
noise = torch.randn_like(x) # gaussian noise
# noise = torch.FloatTensor(x.shape).uniform_(-1, 1) #uniform distribution noise
sqrt_alphas_cumprod_t = get_index_from_list(sqrt_alphas_cumprod, t, x.shape)
sqrt_one_minus_alphas_cumprod_t = get_index_from_list(
sqrt_one_minus_alphas_cumprod, t, x.shape
)
# print("coeff stats ",sqrt_alphas_cumprod_t, " and ", sqrt_one_minus_alphas_cumprod_t)
# mean + variance
return sqrt_alphas_cumprod_t.to(device) * x.to(device) \
+ sqrt_one_minus_alphas_cumprod_t.to(device) * noise.to(device), noise.to(device)
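# The expression above is the closed-form DDPM forward process
#   q(x_t | x_0) = N( sqrt(alpha_bar_t) * x_0, (1 - alpha_bar_t) * I ),
# i.e. x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise,
# using the precomputed sqrt_alphas_cumprod / sqrt_one_minus_alphas_cumprod tables
# defined at the bottom of this file.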
class Block(nn.Module):
def __init__(self, in_ch, out_ch, time_emb_dim, up=False):
super().__init__()
self.time_mlp = nn.Linear(time_emb_dim, out_ch)
if up:
self.conv1 = nn.Conv2d(2 * in_ch, out_ch, 3, padding=1)
self.transform = nn.ConvTranspose2d(out_ch, out_ch, 4, 2, 1)
else:
self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1)
self.transform = nn.Conv2d(out_ch, out_ch, 4, 2, 1)
self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1)
self.bnorm1 = nn.BatchNorm2d(out_ch)
self.bnorm2 = nn.BatchNorm2d(out_ch)
self.relu = nn.LeakyReLU(0.2)
    def forward(self, x, t):
# First Conv
h = self.bnorm1(self.relu(self.conv1(x)))
# Time embedding
time_emb = self.relu(self.time_mlp(t))
# Extend last 2 dimensions
time_emb = time_emb[(...,) + (None,) * 2]
# Add time channel
h = h + time_emb
# Second Conv
h = self.bnorm2(self.relu(self.conv2(h)))
# Down or Upsample
return self.transform(h)
class SinusoidalPositionEmbeddings(nn.Module):
def __init__(self, dim):
super().__init__()
self.dim = dim
def forward(self, time):
device = time.device
half_dim = self.dim // 2
embeddings = math.log(10000) / (half_dim - 1)
embeddings = torch.exp(torch.arange(half_dim, device=device) * -embeddings)
embeddings = time[:, None] * embeddings[None, :]
embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
return embeddings
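# Shape sketch (illustrative): for a batch of timesteps `time` of shape [batch_size],
# the module returns embeddings of shape [batch_size, dim] (sin and cos halves concatenated), e.g.
#   emb = SinusoidalPositionEmbeddings(32)(torch.tensor([5, 100]))  # -> [2, 32]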
class CrossAttention(nn.Module):
"""
### Cross Attention Layer
    This falls back to self-attention when conditional embeddings are not specified.
    """
    use_flash_attention: bool = True  # not used in this implementation; normal_attention is always called
def __init__(self, d_model: int, d_cond: int, n_heads: int, d_head: int, is_inplace: bool = False):
"""
:param d_model: is the input embedding size
:param n_heads: is the number of attention heads
        :param d_head: is the size of an attention head
:param d_cond: is the size of the conditional embeddings
:param is_inplace: specifies whether to perform the attention softmax computation inplace to
save memory
"""
super().__init__()
self.is_inplace = is_inplace
self.n_heads = n_heads
self.d_head = d_head
# Attention scaling factor
self.scale = d_head ** -0.5
# Query, key and value mappings
d_attn = d_head * n_heads
self.to_q = nn.Linear(d_model, d_attn, bias=False)
self.to_k = nn.Linear(d_cond, d_attn, bias=False)
self.to_v = nn.Linear(d_cond, d_attn, bias=False)
# Final linear layer
self.to_out = nn.Sequential(nn.Linear(d_attn, d_model))
def forward(self, x: torch.Tensor, cond: Optional[torch.Tensor] = None):
"""
:param x: are the input embeddings of shape `[batch_size, height * width, d_model]`
:param cond: is the conditional embeddings of shape `[batch_size, n_cond, d_cond]`
"""
# If `cond` is `None` we perform self attention
has_cond = cond is not None
if not has_cond:
cond = x
# Get query, key and value vectors
q = self.to_q(x)
k = self.to_k(cond)
v = self.to_v(cond)
return self.normal_attention(q, k, v)
def normal_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
"""
#### Normal Attention
:param q: are the query vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        :param k: are the key vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
        :param v: are the value vectors before splitting heads, of shape `[batch_size, seq, d_attn]`
"""
# Split them to heads of shape `[batch_size, seq_len, n_heads, d_head]`
q = q.view(*q.shape[:2], self.n_heads, -1)
k = k.view(*k.shape[:2], self.n_heads, -1)
v = v.view(*v.shape[:2], self.n_heads, -1)
# Calculate attention $\frac{Q K^\top}{\sqrt{d_{key}}}$
attn = torch.einsum('bihd,bjhd->bhij', q, k) * self.scale
# Compute softmax
# $$\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)$$
if self.is_inplace:
half = attn.shape[0] // 2
attn[half:] = attn[half:].softmax(dim=-1)
attn[:half] = attn[:half].softmax(dim=-1)
else:
attn = attn.softmax(dim=-1)
# Compute attention output
# $$\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)V$$
out = torch.einsum('bhij,bjhd->bihd', attn, v)
# Reshape to `[batch_size, height * width, n_heads * d_head]`
out = out.reshape(*out.shape[:2], -1)
# Map to `[batch_size, height * width, d_model]` with a linear layer
return self.to_out(out)
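# Usage sketch (illustrative, matching the configuration used in SimpleUnet below):
#   attn = CrossAttention(d_model=3, d_cond=32, n_heads=16, d_head=16)
#   x = torch.randn(2, 64 * 64, 3)   # [batch_size, height * width, d_model]
#   cond = torch.randn(2, 1, 32)     # [batch_size, n_cond, d_cond]
#   out = attn(x, cond)              # [batch_size, height * width, d_model]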
class SimpleUnet(nn.Module):
def __init__(self):
super().__init__()
image_channels = 3
# down_channels = (64, 128, 256, 512, 1024)
# up_channels = (1024, 512, 256, 128, 64)
down_channels = (16, 32, 64, 128, 256)
up_channels = (256, 128, 64, 32, 16)
out_dim = 1
time_emb_dim = 32
# Time embedding
self.time_mlp = nn.Sequential(
SinusoidalPositionEmbeddings(time_emb_dim),
nn.Linear(time_emb_dim, time_emb_dim),
nn.ReLU()
)
# Initial projection
self.conv0 = nn.Conv2d(image_channels, down_channels[0], 3, padding=1)
# Downsample
self.downs = nn.ModuleList([Block(down_channels[i], down_channels[i + 1], \
time_emb_dim) \
for i in range(len(down_channels) - 1)])
# Upsample
self.ups = nn.ModuleList([Block(up_channels[i], up_channels[i + 1], \
time_emb_dim, up=True) \
for i in range(len(up_channels) - 1)])
self.silu = nn.SiLU()
        self.output = nn.Conv2d(up_channels[-1], 3, out_dim)  # 1x1 conv back to 3 image channels (out_dim is the kernel size)
self.apply_tanh = nn.Tanh()
self.cross_attention_module = CrossAttention(3, 32, 16, 16)
def forward(self, x, y, timestep):
        # Embed the class condition using cross attention
batch_size = x.shape[0]
y = self.time_mlp(y)
y = y[:, None, :]
x = x.permute(0, 2, 3, 1).view(batch_size, IMG_SIZE * IMG_SIZE, 3)
x2 = x + self.cross_attention_module(x, y)
x2 = x2.view(batch_size, IMG_SIZE, IMG_SIZE, 3).permute(0, 3, 1, 2)
        # Embed the timestep
t = self.time_mlp(timestep)
# Initial conv
x2 = self.conv0(x2)
# Unet
residual_inputs = []
for down in self.downs:
x2 = down(x2, t)
residual_inputs.append(x2)
for up in self.ups:
residual_x2 = residual_inputs.pop()
# Add residual x2 as additional channels
x2 = torch.cat((x2, residual_x2), dim=1)
x2 = up(x2, t)
x2 = self.silu(x2)
output = self.output(x2)
return output
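# Shape sketch (illustrative): with IMG_SIZE = 64 and T = 1000 as defined below,
#   model = SimpleUnet()
#   x = torch.randn(4, 3, IMG_SIZE, IMG_SIZE)   # noisy images
#   y = torch.tensor([0, 1, 0, 1])              # per-image class condition
#   t = torch.randint(0, T, (4,)).long()        # per-image timesteps
#   out = model(x, y, t)                        # [4, 3, IMG_SIZE, IMG_SIZE]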
def get_loss(model, x_0, t):
    latent, condition = x_0  # latent and condition have the same shape
    device = next(model.parameters()).device
    latent = latent.to(device)
    condition = condition.to(device)
    x_noisy, noise = forward_diffusion_sample(latent, t, device)
    noise_pred = model(x_noisy, condition, t)
    # return F.l1_loss(noise, noise_pred)
    return F.mse_loss(noise, noise_pred)
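# Minimal training sketch (illustrative; `dataloader` is a hypothetical DataLoader
# yielding (latent, condition) pairs and is not defined in this file):
#   model = SimpleUnet().cuda()
#   optimizer = Adam(model.parameters(), lr=1e-4)
#   for latent, condition in dataloader:
#       t = torch.randint(0, T, (latent.shape[0],), device="cuda").long()
#       loss = get_loss(model, (latent, condition), t)
#       optimizer.zero_grad()
#       loss.backward()
#       optimizer.step()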
@torch.no_grad()
def sample_timestep(x, model, y, t):
    """
    Performs one reverse-diffusion step: predicts the noise with the model,
    computes the posterior mean, and adds posterior noise for t > 0.
    """
    betas_t = get_index_from_list(betas, t, x.shape)
sqrt_one_minus_alphas_cumprod_t = get_index_from_list(
sqrt_one_minus_alphas_cumprod, t, x.shape
)
sqrt_recip_alphas_t = get_index_from_list(sqrt_recip_alphas, t, x.shape)
# Call model (current image - noise prediction)
model_mean = sqrt_recip_alphas_t * (
x - (betas_t / sqrt_one_minus_alphas_cumprod_t) * model(x, y, t)
)
posterior_variance_t = get_index_from_list(posterior_variance, t, x.shape)
# print("model prediction stats ",torch.max(model(x, y, t)), " and ", torch.min(model(x, y, t)))
if t == 0:
return model_mean
else:
noise = torch.randn_like(x)
return model_mean + torch.sqrt(posterior_variance_t) * noise
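# The step above is the standard DDPM reverse update:
#   mu_theta(x_t, t) = 1/sqrt(alpha_t) * ( x_t - beta_t / sqrt(1 - alpha_bar_t) * eps_theta(x_t, y, t) )
#   x_{t-1} = mu_theta(x_t, t) + sqrt(posterior_variance_t) * z,  z ~ N(0, I)  (no noise added at t = 0)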
def show_tensor_image(image):
reverse_transforms = transforms.Compose([
transforms.Lambda(lambda t: (t + 1) / 2),
transforms.Lambda(lambda t: t.permute(1, 2, 0)), # CHW to HWC
transforms.Lambda(lambda t: t * 255.),
transforms.Lambda(lambda t: t.numpy().astype(np.uint8)),
transforms.ToPILImage(),
])
# Take first image of batch
if len(image.shape) == 4:
image = image[0, :, :, :]
plt.imshow(reverse_transforms(image))
def generate_latent(model_dir, cancer_type, output_dir):
    if cancer_type == 'benign':
model_name = "digestpath_mask_benign_default.pt"
else:
model_name = "digestpath_mask_malignant_default.pt"
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = os.path.join(model_dir, model_name)
model = SimpleUnet()
model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
print("model loaded")
model.eval()
# cancer_grade = random.randint(0, 1)
    condition = torch.tensor([0 if cancer_type == 'benign' else 1], device=device)  # benign: 0 / malignant: 1 grade cancer
# condition = torch.full([1, 1, IMG_SIZE, IMG_SIZE], condition).float().cuda()
img = torch.randn((1, 3, IMG_SIZE, IMG_SIZE), device=device)
    for j in reversed(range(T)):
t = torch.full((1,), j, device=device, dtype=torch.long)
img = sample_timestep(img, model, condition, t)
print("sampled image ", torch.max(img), " and ", torch.min(img))
save_image(img, os.path.join(output_dir, "sample.png"))
torch.save(img, os.path.join(output_dir, "sample.pt"))
# Define beta schedule
T = 1000
IMG_SIZE = 64
betas = get_beta_schedule(beta_schedule="linear",
beta_start=0.0001,
beta_end=0.02,
num_diffusion_timesteps=T)
# Pre-calculate different terms for closed form
alphas = 1. - betas
alphas_cumprod = torch.cumprod(alphas, axis=0)
sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod) # root(alpha_bar)
sqrt_one_minus_alphas_cumprod = torch.sqrt(1. - alphas_cumprod) # root(1-alpha_bar)
sqrt_recip_alphas = torch.sqrt(1.0 / alphas) # 1/root(alpha)
alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value=1.0)
posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
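# posterior_variance corresponds to beta_tilde_t = beta_t * (1 - alpha_bar_{t-1}) / (1 - alpha_bar_t),
# the variance of q(x_{t-1} | x_t, x_0) used in sample_timestep above.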
# model_dir = "trained_models/diffusion"
# output_dir = r"F:\Datasets\DigestPath\scene_generation\all\1000\256\test\output\benign"
# generate_latent(model_dir, 'malignant', output_dir)