Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

.ipynb_checkpoints/README-checkpoint.md +116 -0
.ipynb_checkpoints/model_index-checkpoint.json +25 -0
.ipynb_checkpoints/pipeline_waifu-checkpoint.py +376 -23
.ipynb_checkpoints/test-checkpoint.ipynb +0 -0
.ipynb_checkpoints/waifu-checkpoint.png +0 -0
1.png +0 -0
README.md +2 -1
Untitled.ipynb +1 -1
__pycache__/pipeline_waifu.cpython-310.pyc +0 -0
pipeline_waifu.py +5 -5
promo.png +2 -2
test.ipynb +0 -0
transformer/diffusion_pytorch_model.fp16.safetensors +1 -1
waifu.png +0 -0

.ipynb_checkpoints/README-checkpoint.md ADDED Viewed

	@@ -0,0 +1,116 @@

+---
+license: apache-2.0
+pipeline_tag: text-to-image
+---
+# Work / train in progress!
+![image](./promo.png)
+⚡️Waifu: efficient high-resolution waifu synthesis
+Waifu is a free text-to-image model that can efficiently generate images from prompts written in 80 languages. Our goal is to create a small model without compromising on quality.
+## Core designs include:
+(1) [**AuraDiffusion/16ch-vae**](https://huggingface.co/AuraDiffusion/16ch-vae): A fully open source 16ch VAE. Natively trained in fp16. \
+(2) [**Linear DiT**](https://github.com/NVlabs/Sana): we use a 1.6B DiT transformer with linear attention. \
+(3) [**MEXMA-SigLIP**](https://huggingface.co/visheratin/mexma-siglip): MEXMA-SigLIP is a model that combines the [MEXMA](https://huggingface.co/facebook/MEXMA) multilingual text encoder and an image encoder from the [SigLIP](https://huggingface.co/timm/ViT-SO400M-14-SigLIP-384) model. This allows us to get a high-performance CLIP model for 80 languages. \
+(4) Other: we use the Flow-Euler sampler, the Adafactor-Fused optimizer and bf16 precision for training, and combine efficient caption labeling (MoonDream, CogVLM, human, GPTs) with danbooru tags to accelerate convergence.
+## Example
+```py
+import torch
+from diffusers import DiffusionPipeline
+from transformers import XLMRobertaTokenizerFast,XLMRobertaModel
+from diffusers import FlowMatchEulerDiscreteScheduler
+from diffusers.models import AutoencoderKL
+from diffusers import SanaTransformer2DModel
+pipe_id = "AiArtLab/waifu-2b"
+variant = "fp16"
+# tokenizer
+tokenizer = XLMRobertaTokenizerFast.from_pretrained(
+    pipe_id,
+    subfolder="tokenizer"
+)
+# text_encoder
+text_encoder = XLMRobertaModel.from_pretrained(
+    pipe_id,
+    variant=variant,
+    subfolder="text_encoder",
+    add_pooling_layer=False
+).to("cuda")
+# scheduler
+scheduler = FlowMatchEulerDiscreteScheduler(shift=1.0)
+# VAE
+vae = AutoencoderKL.from_pretrained(
+    pipe_id,
+    variant=variant,
+    subfolder="vae"
+).to("cuda")
+# Transformer
+transformer = SanaTransformer2DModel.from_pretrained(
+    pipe_id,
+    variant=variant,
+    subfolder="transformer"
+).to("cuda")
+# Pipeline
+pipeline = DiffusionPipeline.from_pretrained(
+    pipe_id,
+    tokenizer=tokenizer,
+    text_encoder=text_encoder,
+    vae=vae,
+    transformer=transformer,
+    trust_remote_code=True,
+).to("cuda")
+print(pipeline)
+prompt = 'аниме девушка, waifu, يبتسم جنسيا , sur le fond de la tour Eiffel'
+generator = torch.Generator(device="cuda").manual_seed(42)
+image = pipeline(
+    prompt = prompt,
+    negative_prompt = "",
+    generator=generator,
+)[0]
+for img in image:
+    img.show()
+    img.save('waifu.png')
+```
+![image](./waifu.png)
+## Donations
+We are a small, GPU-poor group of enthusiasts (current training budget ~$2k).
+Please contact us if you can provide GPUs for training.
+DOGE: DEw2DR8C7BnF8GgcrfTzUjSnGkuMeJhg83
+![image](./1.png)
+A fluffy domestic cat with piercing green eyes sits attentively in a sunlit room filled with natural light streaming through large windows, its soft fur reflecting warm hues of orange from the golden glow casting across its sleek body and delicate features
+## Contacts
+[recoilme](https://t.me/recoilme)
+## How to cite
+```bibtex
+@misc{Waifu,
+    url    = {https://huggingface.co/AiArtLab/waifu-2b},
+    title  = {waifu-2b},
+    author = {recoilme, muinez, femboysLover}
+}
+```

.ipynb_checkpoints/model_index-checkpoint.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_class_name": ["pipeline_waifu", "WaifuPipeline"],
+  "_diffusers_version": "0.32.0.dev0",
+  "_name_or_path": "AiArtLab/waifu-2b",
+  "scheduler": [
+    "diffusers",
+    "FlowMatchEulerDiscreteScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "XLMRobertaModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "XLMRobertaTokenizerFast"
+  ],
+  "transformer": [
+    "diffusers",
+    "SanaTransformer2DModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

.ipynb_checkpoints/pipeline_waifu-checkpoint.py CHANGED Viewed

@@ -1,23 +1,113 @@
 import torch
 from diffusers import DiffusionPipeline
-from typing import Callable, Dict, List, Optional, Tuple, Union
-# waifu
-# tokenizer
-from transformers import XLMRobertaTokenizerFast
-# text_encoder
-from transformers import XLMRobertaModel
-# scheduler
-from diffusers import FlowMatchEulerDiscreteScheduler
-# VAE
-from diffusers.models import AutoencoderKL
-# Transformer
 from diffusers import SanaTransformer2DModel
 class WaifuPipeline(DiffusionPipeline):
     r"""
-    Pipeline for text-to-image generation using [waifu](https://github.com/recoilme/waifu).
     """
     model_cpu_offload_seq = "text_encoder->transformer->vae"
@@ -37,9 +127,272 @@ class WaifuPipeline(DiffusionPipeline):
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
-        self.vae_scale_factor = 8
         self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
     @torch.no_grad()
     def __call__(
         self,
@@ -60,11 +413,11 @@ class WaifuPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
-        return_dict: bool = True,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
-    ) -> Union[SanaPipelineOutput, Tuple]:
         """
         Function invoked when calling the pipeline for generation.
@@ -138,15 +491,15 @@ class WaifuPipeline(DiffusionPipeline):
         Examples:
         Returns:
-            [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.sana.pipeline_output.SanaPipelineOutput`] is returned,
                 otherwise a `tuple` is returned where the first element is a list with the generated images
         """
-        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
-            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt,
             height,
@@ -273,16 +626,16 @@ class WaifuPipeline(DiffusionPipeline):
         else:
             latents = latents.to(self.vae.dtype)
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
-            if use_resolution_binning:
-                image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
         if not output_type == "latent":
             image = self.image_processor.postprocess(image, output_type=output_type)
         # Offload all models
         self.maybe_free_model_hooks()
         if not return_dict:
             return (image,)
-        return SanaPipelineOutput(images=image)

+# Copyright 2024 PixArt-Sigma Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+from typing import Callable, Dict, List, Optional, Union
 import torch
+from diffusers.image_processor import PixArtImageProcessor
+from diffusers.utils.torch_utils import randn_tensor
 from diffusers import DiffusionPipeline
+from transformers import XLMRobertaTokenizerFast,XLMRobertaModel
 from diffusers import SanaTransformer2DModel
+from diffusers.models import AutoencoderKL
+from diffusers import FlowMatchEulerDiscreteScheduler
+from typing import List, Union
+import numpy as np
+import PIL.Image
+# Usage example text for the pipeline defined in this module.
+# NOTE(review): the snippet imports `WaifuPipeline` from `diffusers`, but the class is defined in this
+# module itself, and the prompt text still mentions "Sana" — confirm the intended import path and wording.
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import WaifuPipeline
+        >>> pipe = WaifuPipeline.from_pretrained(
+        ...     "AiArtLab/waifu-2b"
+        ... )
+        >>> pipe.to("cuda")
+        >>> image = pipe(prompt='a cyberpunk cat with a neon sign that says "Sana"')[0]
+        >>> image[0].save("output.png")
+        ```
+"""
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    r"""
+    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
+    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
+    Args:
+        scheduler (`SchedulerMixin`):
+            The scheduler to get timesteps from.
+        num_inference_steps (`int`):
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
+        device (`str` or `torch.device`, *optional*):
+            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        timesteps (`List[int]`, *optional*):
+            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
+            `num_inference_steps` and `sigmas` must be `None`.
+        sigmas (`List[float]`, *optional*):
+            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
+            `num_inference_steps` and `timesteps` must be `None`.
+    Returns:
+        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
+        second element is the number of inference steps.
+    Raises:
+        ValueError: If both `timesteps` and `sigmas` are passed, or if the scheduler's `set_timesteps` signature does
+            not accept the custom schedule argument (`timesteps` or `sigmas`) that was supplied.
+    """
+    # The two custom schedules are mutually exclusive.
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+    if timesteps is not None:
+        # Inspect the signature so an unsupported scheduler fails with a clear message instead of a TypeError.
+        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accepts_timesteps:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" timestep schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        # With a custom schedule the step count is derived from the schedule itself.
+        num_inference_steps = len(timesteps)
+    elif sigmas is not None:
+        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
+        if not accept_sigmas:
+            raise ValueError(
+                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+                f" sigmas schedules. Please check whether you are using the correct scheduler."
+            )
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+        num_inference_steps = len(timesteps)
+    else:
+        # Default path: the caller-supplied `num_inference_steps` is passed through and returned unchanged.
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        timesteps = scheduler.timesteps
+    return timesteps, num_inference_steps
 class WaifuPipeline(DiffusionPipeline):
     r"""
+    Pipeline for text-to-image generation using [Sana](https://huggingface.co/papers/2410.10629).
     """
     model_cpu_offload_seq = "text_encoder->transformer->vae"
             tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler
         )
+        self.vae_scale_factor = (
+            8
+        )
         self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        negative_prompt: str = "",
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        prompt_attention_mask: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        max_sequence_length: int = 512,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`
+                instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For
+                PixArt-Alpha, this should be "".
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. For Sana, it's should be the embeddings of the "" string.
+            max_sequence_length (`int`, defaults to 512): Maximum sequence length to use for the prompt.
+        """
+        if device is None:
+            device = self._execution_device
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+        if self.tokenizer is not None:
+            self.tokenizer.padding_side = "right"
+        max_length = max_sequence_length
+        select_index = [0] + list(range(-max_length + 1, 0))
+        if prompt_embeds is None:
+            prompt = self._text_preprocessing(prompt)
+            max_length_all = max_length
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=max_length_all,
+                truncation=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            prompt_attention_mask = text_inputs.attention_mask
+            prompt_attention_mask = prompt_attention_mask.to(device)
+            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask)
+            prompt_embeds = prompt_embeds[0][:, select_index]
+            prompt_attention_mask = prompt_attention_mask[:, select_index]
+        if self.transformer is not None:
+            dtype = self.transformer.dtype
+        elif self.text_encoder is not None:
+            dtype = self.text_encoder.dtype
+        else:
+            dtype = None
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        bs_embed, seq_len, _ = prompt_embeds.shape
+        # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
+        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1)
+        prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            #print("do_classifier_free_guidance and negative_prompt_embeds is None")
+            uncond_tokens = [negative_prompt] * batch_size if isinstance(negative_prompt, str) else negative_prompt
+            uncond_tokens = self._text_preprocessing(uncond_tokens)
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_attention_mask=True,
+                add_special_tokens=True,
+                return_tensors="pt",
+            )
+            negative_prompt_attention_mask = uncond_input.attention_mask
+            negative_prompt_attention_mask = negative_prompt_attention_mask.to(device)
+            negative_prompt_embeds = self.text_encoder(
+                uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask
+            )
+            negative_prompt_embeds = negative_prompt_embeds[0]
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1)
+            negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1)
+        else:
+            negative_prompt_embeds = None
+            negative_prompt_attention_mask = None
+        return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        """
+        Build the optional keyword arguments for `self.scheduler.step`.
+
+        `eta` and `generator` are each included only when the scheduler's `step` signature declares a
+        parameter with that name.
+
+        Returns:
+            `dict`: keyword arguments containing at most the keys `"eta"` and `"generator"`.
+        """
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        callback_on_step_end_tensor_inputs=None,
+        negative_prompt=None,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        prompt_attention_mask=None,
+        negative_prompt_attention_mask=None,
+    ):
+        """
+        Validate the call arguments and raise on the first inconsistency found.
+
+        Checks, in order: `height`/`width` divisible by 64; callback tensor names are all listed in
+        `self._callback_tensor_inputs`; exactly one of `prompt` / `prompt_embeds` is given and `prompt` is a
+        `str` or `list`; `negative_prompt_embeds` is not combined with `prompt` or `negative_prompt`; every
+        embeddings tensor comes with its attention mask; and, when both embeddings are passed, embeddings and
+        masks have matching shapes.
+
+        Raises:
+            ValueError: if any of the checks above fails.
+        """
+        if height % 64 != 0 or width % 64 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
+            )
+        # Exactly one of `prompt` and `prompt_embeds` must be supplied.
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        # A text `prompt` cannot be mixed with pre-computed negative embeddings.
+        if prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+        # Pre-computed embeddings must be accompanied by their attention masks.
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+    def _text_preprocessing(self, text):
+        """
+        Normalize prompt text before tokenization.
+
+        A value that is not a tuple or list is wrapped in a one-element list; every entry is then lower-cased
+        and stripped of leading/trailing whitespace.
+
+        Returns:
+            `list`: the normalized strings, one per input entry.
+        """
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+        def process(text: str):
+            text = text.lower().strip()
+            return text
+        return [process(t) for t in text]
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        """
+        Return the initial latents for sampling.
+
+        If `latents` is given it is only moved to `device`/`dtype` and returned. Otherwise a tensor of shape
+        `(batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)` is drawn
+        with `randn_tensor`.
+
+        Raises:
+            ValueError: if `generator` is a list whose length differs from `batch_size` (only checked when
+                new latents are sampled).
+        """
+        # Caller-supplied latents bypass both shape computation and the generator check below.
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+    @property
+    def guidance_scale(self):
+        """Return the value stored in `self._guidance_scale`."""
+        # NOTE(review): `_guidance_scale` is not assigned in the visible code — presumably set in `__call__`; confirm.
+        return self._guidance_scale
+    @property
+    def do_classifier_free_guidance(self):
+        """Return `True` when `self._guidance_scale` is strictly greater than 1.0."""
+        return self._guidance_scale > 1.0
+    @property
+    def num_timesteps(self):
+        """Return the value stored in `self._num_timesteps`."""
+        # NOTE(review): `_num_timesteps` is not assigned in the visible code — presumably set in `__call__`; confirm.
+        return self._num_timesteps
+    @property
+    def interrupt(self):
+        """Return the value stored in `self._interrupt`."""
+        # NOTE(review): `_interrupt` is not assigned in the visible code — presumably set in `__call__`; confirm.
+        return self._interrupt
     @torch.no_grad()
     def __call__(
         self,
         negative_prompt_embeds: Optional[torch.Tensor] = None,
         negative_prompt_attention_mask: Optional[torch.Tensor] = None,
         output_type: Optional[str] = "pil",
+        return_dict: bool = False,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
+    ) -> Union[List[PIL.Image.Image], np.ndarray]:
         """
         Function invoked when calling the pipeline for generation.
         Examples:
         Returns:
+            Union[List[PIL.Image.Image], np.ndarray] is returned,
                 otherwise a `tuple` is returned where the first element is a list with the generated images
         """
+#       if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+#           callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(
             prompt,
             height,
         else:
             latents = latents.to(self.vae.dtype)
             image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
         if not output_type == "latent":
             image = self.image_processor.postprocess(image, output_type=output_type)
+            #image = numpy_to_pil(image)
         # Offload all models
+        #print("Offload all models 4")
         self.maybe_free_model_hooks()
         if not return_dict:
             return (image,)
+        return Union[List[PIL.Image.Image], np.ndarray]

.ipynb_checkpoints/test-checkpoint.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

.ipynb_checkpoints/waifu-checkpoint.png ADDED Viewed

1.png CHANGED Viewed

README.md CHANGED Viewed

@@ -2,7 +2,8 @@
 license: apache-2.0
 pipeline_tag: text-to-image
 ---
-# Work / train in progress
 ⚡️Waifu: efficient high-resolution waifu synthesis

 license: apache-2.0
 pipeline_tag: text-to-image
 ---
+# Work / train in progress!
+![image](./promo.png)
 ⚡️Waifu: efficient high-resolution waifu synthesis

Untitled.ipynb CHANGED Viewed

@@ -172,7 +172,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.6"
   }
  },
  "nbformat": 4,

    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

__pycache__/pipeline_waifu.cpython-310.pyc ADDED Viewed

Binary file (20.3 kB). View file

pipeline_waifu.py CHANGED Viewed

@@ -115,11 +115,11 @@ class WaifuPipeline(DiffusionPipeline):
     def __init__(
         self,
-        tokenizer: None,
-        text_encoder: None,
-        vae: None,
-        transformer: None,
-        scheduler: None,
     ):
         super().__init__()

     def __init__(
         self,
+        tokenizer: XLMRobertaTokenizerFast,
+        text_encoder: XLMRobertaModel,
+        vae: AutoencoderKL,
+        transformer: SanaTransformer2DModel,
+        scheduler: FlowMatchEulerDiscreteScheduler,
     ):
         super().__init__()

promo.png CHANGED Viewed

Git LFS Details

SHA256: 1505648f1fd5b4f352bb621d12942db147f56ac9fce42069ce5e5e007b7a1d98
Pointer size: 132 Bytes
Size of remote file: 1.81 MB

Git LFS Details

SHA256: 12a5ea2c007c532b68f6ce3f60572ce05e3e08edc0f814b077822937e727e500
Pointer size: 132 Bytes
Size of remote file: 1.69 MB

test.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

transformer/diffusion_pytorch_model.fp16.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b8de92385645ac8c8e4a6ac9072ecc832a9f639d6acec5726ab87943f880791
 size 3203093344

 version https://git-lfs.github.com/spec/v1
+oid sha256:2d41fc249d188765f985a9502e806a8ea239580ba95387ed323dfb6304a48515
 size 3203093344

waifu.png CHANGED Viewed