Spaces:

SDbiaseval
/

stablediffusionembeddings

No application file

File size: 6,988 Bytes

99cea46

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "873b1354-b85f-4c5b-9163-95190f07b39a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import zipfile\n",
    "from PIL import Image\n",
    "from io import BytesIO\n",
    "import numpy as np\n",
    "from datasets import load_dataset\n",
    "import torch\n",
    "from diffusers import AutoencoderKL, UNet2DModel, UNet2DConditionModel\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "35949720-3e01-43b0-8487-a1b2131d5a9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_image(image):\n",
    "    w, h = image.size\n",
    "    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32\n",
    "    image = image.resize((w, h), resample=Image.Resampling.LANCZOS)\n",
    "    image = np.array(image).astype(np.float32) / 255.0\n",
    "    image = image[None].transpose(0, 3, 1, 2)\n",
    "    return 2.0 * image - 1.0\n",
    "\n",
    "def vae_embedding(preprocessed, num_samples=5, device=\"cuda\"):\n",
    "    with torch.no_grad():\n",
    "        processed_image = preprocessed.to(device=device)\n",
    "        latent_dist = vae.encode(processed_image).latent_dist\n",
    "        t = [0.18215*latent_dist.sample().to(\"cpu\").squeeze() for i in range(num_samples)] # sample num_samples latent vecs\n",
    "        t = torch.stack(t) # stack them\n",
    "        return torch.mean(t, axis=0).numpy() #average them. output shape: (4,64,64)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6ebd9d84-98f7-4883-ac4b-0ec875b86911",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration SDbiaseval--dataset-cc8e38e46c1acd54\n",
      "Found cached dataset parquet (/mnt/1da05489-3812-4f15-a6e5-c8d3c57df39e/cache/huggingface/SDbiaseval___parquet/SDbiaseval--dataset-cc8e38e46c1acd54/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f184861d2e2749c9b7c1c1ea3910be27",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 196 ms, sys: 23.3 ms, total: 219 ms\n",
      "Wall time: 2.51 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# dset = load_dataset(\"./dataset.py\", ignore_verifications=True) This uses the loading script and loads data from the zipped folders\n",
    "dset = load_dataset(\"SDbiaseval/dataset\")\n",
    "ds = dset[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fd832e2b-6ced-43ca-a4ca-fd54f523d22e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "vae = AutoencoderKL.from_pretrained(\"CompVis/stable-diffusion-v1-4\", subfolder=\"vae\");\n",
    "vae.eval()\n",
    "vae.to(\"cuda\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b2af2692-a372-4b96-8250-8c83c122457d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19554 batches of 16. Last batch of size 15.\n"
     ]
    }
   ],
   "source": [
    "ix = np.arange(len(ds))\n",
    "np.random.shuffle(ix)\n",
    "batch_size = 16\n",
    "batche_indices = np.array_split(ix, np.ceil(len(ix)/batch_size))\n",
    "print(f\"{len(batche_indices)} batches of {batch_size}. Last batch of size {len(batche_indices[-1])}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "8a54fdf1-f0e5-487e-b53d-afc8dbcc989c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9h 52min 30s, sys: 2min 25s, total: 9h 54min 55s\n",
      "Wall time: 7h 54min 48s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "embs = []\n",
    "for i in batche_indices:\n",
    "    imx = ds.select(i)[\"image\"]\n",
    "    preprocessed = np.concatenate([preprocess_image(im) for im in imx])\n",
    "    emb = vae_embedding(torch.from_numpy(preprocessed), num_samples=10)\n",
    "    embs.append(emb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "06d9346c-912f-4e24-a0ff-d5386c1780a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('embs.pkl', 'wb') as f:\n",
    "    pickle.dump(embs, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d0cbe87-dfb2-4c59-adf5-b4d015e2d441",
   "metadata": {},
   "outputs": [],
   "source": [
    "embeddings = np.concatenate(embs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a6e826a9-93e0-4298-813d-9c42d139ff96",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"embs.pkl\", \"rb\") as f:\n",
    "    embeddings = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0783bb60-5439-4a62-a4ac-15198688b331",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.82 s, sys: 4.34 s, total: 8.16 s\n",
      "Wall time: 8.2 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "embeddings = np.concatenate(embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "50369f37-a4f1-4a7c-89dd-b4ef9a8ebf8b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(312860, 4, 64, 64)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embeddings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "93f1ea7b-cbcd-49c3-a7c7-4ea26012f9b3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 10.3 s, total: 10.3 s\n",
      "Wall time: 10.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "with open('vae_embeddings.npy', 'wb') as f:\n",
    "    np.save(f, embeddings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b316682-f5cc-44d7-a8ed-f1da9b6c3089",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}