Remove unnecessary code, hide prints behind debug flag, hide warnings
- float8_quantize.py +15 -24
- flux_emphasis.py +16 -11
- flux_pipeline.py +26 -4
- modules/conditioner.py +0 -13
- quantize_swap_and_dispatch.py +0 -274
- util.py +1 -0
float8_quantize.py
CHANGED
@@ -1,4 +1,3 @@
-from typing import Any, Mapping
import torch
import torch.nn as nn
from torchao.float8.float8_utils import (
@@ -11,6 +10,7 @@ import math
from torch.compiler import is_compiling
from torch import __version__
from torch.version import cuda
+from typing import TypeVar

IS_TORCH_2_4 = __version__ < (2, 4, 9)
LT_TORCH_2_4 = __version__ < (2, 4)
@@ -29,23 +29,7 @@ try:
except ImportError:
    CublasLinear = type(None)

-
-def check_scale_tensor(tensor):
-    return (
-        tensor is not None
-        and isinstance(tensor, torch.Tensor)
-        and tensor.dtype == torch.float32
-        and tensor.numel() == 1
-        and tensor != torch.zeros_like(tensor)
-    )
-
-
-def check_scale_in_state_dict(state_dict, key):
-    return key in state_dict and check_scale_tensor(state_dict[key])
-
-
-def check_scales_given_state_dict_and_keys(state_dict, keys):
-    return all(check_scale_in_state_dict(state_dict, key) for key in keys)
+FluxType = TypeVar("FluxType", nn.Module)


class F8Linear(nn.Module):
@@ -245,6 +229,7 @@ class F8Linear(nn.Module):
        init.uniform_(self.bias, -bound, bound)
        self.quantize_weight()
        self.max_value = torch.finfo(self.float8_dtype).max
+        self.input_max_value = torch.finfo(self.input_float8_dtype).max

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.input_scale_initialized or is_compiling():
@@ -280,7 +265,7 @@
        linear: nn.Linear,
        float8_dtype=torch.float8_e4m3fn,
        input_float8_dtype=torch.float8_e5m2,
-    ):
+    ) -> "F8Linear":
        f8_lin = cls(
            in_features=linear.in_features,
            out_features=linear.out_features,
@@ -300,7 +285,7 @@ def recursive_swap_linears(
    model: nn.Module,
    float8_dtype=torch.float8_e4m3fn,
    input_float8_dtype=torch.float8_e5m2,
-):
+) -> None:
    """
    Recursively swaps all nn.Linear modules in the given model with F8Linear modules.

@@ -337,23 +322,29 @@ def recursive_swap_linears(

@torch.inference_mode()
def quantize_flow_transformer_and_dispatch_float8(
-    flow_model:
+    flow_model: FluxType,
    device=torch.device("cuda"),
    float8_dtype=torch.float8_e4m3fn,
    input_float8_dtype=torch.float8_e5m2,
    offload_flow=False,
-):
+) -> FluxType:
    """
    Quantize the flux flow transformer model (original BFL codebase version) and dispatch to the given device.
+
+    Iteratively pushes each module to device, evals, replaces linear layers with F8Linear except for final_layer, and quantizes.
+
+    Allows for fast dispatch to gpu & quantize without causing OOM on gpus with limited memory.
+
+    After dispatching, if offload_flow is True, offloads the model to cpu.
    """
-    for
+    for module in flow_model.double_blocks:
        module.to(device)
        module.eval()
        recursive_swap_linears(
            module, float8_dtype=float8_dtype, input_float8_dtype=input_float8_dtype
        )
        torch.cuda.empty_cache()
-    for
+    for module in flow_model.single_blocks:
        module.to(device)
        module.eval()
        recursive_swap_linears(
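The new FluxType annotation and the per-block loop are the core of this change: each double and single block is moved to the GPU, has its nn.Linear layers swapped for F8Linear, and is quantized before the next block is touched, so peak memory stays close to a single block. A rough usage sketch, assuming a BFL-style flux transformer already constructed on CPU; the load_flow_model helper below is hypothetical and not part of this repo:

import torch
from float8_quantize import quantize_flow_transformer_and_dispatch_float8

# hypothetical loader: any flux transformer exposing .double_blocks / .single_blocks works
flow_model = load_flow_model("flux-dev", device="cpu")

flow_model = quantize_flow_transformer_and_dispatch_float8(
    flow_model,
    device=torch.device("cuda:0"),
    float8_dtype=torch.float8_e4m3fn,      # weight dtype
    input_float8_dtype=torch.float8_e5m2,  # input/activation dtype
    offload_flow=False,                    # True would move the quantized model back to cpu
)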
flux_emphasis.py
CHANGED
@@ -111,7 +111,9 @@ def parse_prompt_attention(text):
    return res


-def get_prompts_tokens_with_weights(clip_tokenizer: CLIPTokenizer, prompt: str):
+def get_prompts_tokens_with_weights(
+    clip_tokenizer: CLIPTokenizer, prompt: str, debug: bool = False
+):
    """
    Get prompt token ids and weights, this function works for both prompt and negative prompt

@@ -152,13 +154,14 @@ def get_prompts_tokens_with_weights(clip_tokenizer: CLIPTokenizer, prompt: str):
        ).input_ids
        # so that tokenize whatever length prompt
        # the returned token is a 1d list: [320, 1125, 539, 320]
-
-
-
-
-
-
-
+        if debug:
+            print(
+                token,
+                "|FOR MODEL LEN{}|".format(maxlen),
+                clip_tokenizer.decode(
+                    token, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                ),
+            )
        # merge the new tokens to the all tokens holder: text_tokens
        text_tokens = [*text_tokens, *token]

@@ -306,6 +309,7 @@ def get_weighted_text_embeddings_flux(
    device: Optional[torch.device] = None,
    target_device: Optional[torch.device] = torch.device("cuda:0"),
    target_dtype: Optional[torch.dtype] = torch.bfloat16,
+    debug: bool = False,
):
    """
    This function can process long prompt with weights, no length limitation
@@ -350,12 +354,12 @@ def get_weighted_text_embeddings_flux(

    # tokenizer 1
    prompt_tokens_clip, prompt_weights_clip = get_prompts_tokens_with_weights(
-        tokenizer_clip, prompt
+        tokenizer_clip, prompt, debug=debug
    )

    # tokenizer 2
    prompt_tokens_t5, prompt_weights_t5 = get_prompts_tokens_with_weights(
-        tokenizer_t5, prompt
+        tokenizer_t5, prompt, debug=debug
    )

    prompt_tokens_clip_grouped, prompt_weights_clip_grouped = group_tokens_and_weights(
@@ -428,7 +432,8 @@ def get_weighted_text_embeddings_flux(
        "last_hidden_state"
    ]
    t5_embeds = apply_weights(prompt_tokens_t5, weight_tensor_t5, t5_embeds, eos_2)
-
+    if debug:
+        print(t5_embeds.shape)
    if t5_embeds.shape[0] == 1 and num_images_per_prompt > 1:
        t5_embeds = repeat(t5_embeds, "1 ... -> bs ...", bs=num_images_per_prompt)
    txt_ids = torch.zeros(
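With the token dump gated on the new debug parameter, callers get silence by default and can opt back in per call. A small sketch of the call as changed above, assuming a CLIP tokenizer is available; the checkpoint name is only an example:

from transformers import CLIPTokenizer
from flux_emphasis import get_prompts_tokens_with_weights

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")  # example checkpoint

# quiet by default
tokens, weights = get_prompts_tokens_with_weights(tokenizer, "a (red:1.3) car")

# opt in to the per-chunk token/decode printout
tokens, weights = get_prompts_tokens_with_weights(tokenizer, "a (red:1.3) car", debug=True)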
flux_pipeline.py
CHANGED
@@ -3,7 +3,11 @@ import math
from typing import TYPE_CHECKING, Callable, List
from PIL import Image
import numpy as np
+import warnings

+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=DeprecationWarning)
import torch

from einops import rearrange
@@ -61,6 +65,7 @@ class FluxPipeline:
        clip_device: torch.device | str = "cuda:1",
        t5_device: torch.device | str = "cuda:1",
        config: ModelSpec = None,
+        debug: bool = False,
    ):
        """
        Initialize the FluxPipeline class.
@@ -68,6 +73,7 @@ class FluxPipeline:
        This class is responsible for preparing input tensors for the Flux model, generating
        timesteps and noise, and handling device management for model offloading.
        """
+        self.debug = debug
        self.name = name
        self.device_flux = (
            flux_device
@@ -113,7 +119,7 @@ class FluxPipeline:

        if self.config.compile_blocks or self.config.compile_extras:
            if not self.config.prequantized_flow:
-
+                logger.info("Running warmups for compile...")
                warmup_dict = dict(
                    prompt="A beautiful test image used to solidify the fp8 nn.Linear input scales prior to compilation 😉",
                    height=768,
@@ -204,6 +210,8 @@ class FluxPipeline:
        if self.offload_text_encoder:
            self.clip.to(device=self.device_clip)
            self.t5.to(device=self.device_t5)
+
+        # get the text embeddings
        vec, txt, txt_ids = get_weighted_text_embeddings_flux(
            self,
            prompt,
@@ -211,7 +219,9 @@ class FluxPipeline:
            device=self.device_clip,
            target_device=target_device,
            target_dtype=target_dtype,
+            debug=self.debug,
        )
+        # offload text encoder to cpu if needed
        if self.offload_text_encoder:
            self.clip.to("cpu")
            self.t5.to("cpu")
@@ -494,6 +504,8 @@ class FluxPipeline:
        logger.info(f"Generating with:\nSeed: {seed}\nPrompt: {prompt}")

        generator = torch.Generator(device=self.device_flux).manual_seed(seed)
+
+        # preprocess the latent
        img, timesteps = self.preprocess_latent(
            init_image=init_image,
            height=height,
@@ -503,6 +515,8 @@ class FluxPipeline:
            generator=generator,
            num_images=num_images,
        )
+
+        # prepare inputs
        img, img_ids, vec, txt, txt_ids = map(
            lambda x: x.contiguous(),
            self.prepare(
@@ -518,8 +532,11 @@ class FluxPipeline:
            (img.shape[0],), guidance, device=self.device_flux, dtype=self.dtype
        )
        t_vec = None
+        # dispatch to gpu if offloaded
        if self.offload_flow:
            self.model.to(self.device_flux)
+
+        # perform the denoising loop
        for t_curr, t_prev in tqdm(
            zip(timesteps[:-1], timesteps[1:]), total=len(timesteps) - 1, disable=silent
        ):
@@ -532,6 +549,7 @@ class FluxPipeline:
                )
            else:
                t_vec = t_vec.reshape((img.shape[0],)).fill_(t_curr)
+
            pred = self.model.forward(
                img=img,
                img_ids=img_ids,
@@ -544,6 +562,7 @@ class FluxPipeline:

            img = img + (t_prev - t_curr) * pred

+        # offload the model to cpu if needed
        if self.offload_flow:
            self.model.to("cpu")
            torch.cuda.empty_cache()
@@ -557,16 +576,18 @@ class FluxPipeline:

    @classmethod
    def load_pipeline_from_config_path(
-        cls, path: str, flow_model_path: str = None
+        cls, path: str, flow_model_path: str = None, debug: bool = False
    ) -> "FluxPipeline":
        with torch.inference_mode():
            config = load_config_from_path(path)
            if flow_model_path:
                config.ckpt_path = flow_model_path
-            return cls.load_pipeline_from_config(config)
+            return cls.load_pipeline_from_config(config, debug=debug)

    @classmethod
-    def load_pipeline_from_config(
+    def load_pipeline_from_config(
+        cls, config: ModelSpec, debug: bool = False
+    ) -> "FluxPipeline":
        from float8_quantize import quantize_flow_transformer_and_dispatch_float8

        with torch.inference_mode():
@@ -603,4 +624,5 @@ class FluxPipeline:
            clip_device=clip_device,
            t5_device=t5_device,
            config=config,
+            debug=debug,
        )
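The debug flag is threaded from the classmethod loaders through __init__ and into get_weighted_text_embeddings_flux, so one switch controls all of the diagnostic prints, while the warnings filters at the top of the module silence UserWarning, FutureWarning and DeprecationWarning as soon as flux_pipeline is imported. A sketch of the intended entry point; the config path is a placeholder:

from flux_pipeline import FluxPipeline  # importing this module installs the warnings filters

pipe = FluxPipeline.load_pipeline_from_config_path(
    "configs/config-dev-offload.json",  # placeholder config path
    flow_model_path=None,               # optional override for the flow checkpoint
    debug=True,                         # re-enable the gated prints
)

Note that warnings.filterwarnings applies process-wide, so anything imported after flux_pipeline inherits the same suppression.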
modules/conditioner.py
CHANGED
@@ -14,19 +14,6 @@ from transformers.utils.quantization_config import QuantoConfig, BitsAndBytesConfig
CACHE_DIR = os.environ.get("HF_HOME", "~/.cache/huggingface")


-def into_quantization_name(quantization_dtype: str) -> str:
-    if quantization_dtype == "qfloat8":
-        return "float8"
-    elif quantization_dtype == "qint4":
-        return "int4"
-    elif quantization_dtype == "qint8":
-        return "int8"
-    elif quantization_dtype == "qint2":
-        return "int2"
-    else:
-        raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")
-
-
def auto_quantization_config(
    quantization_dtype: str,
) -> QuantoConfig | BitsAndBytesConfig:
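The removed into_quantization_name helper only translated quanto-style dtype strings into the names the transformers quantization configs expect; auto_quantization_config is now the sole entry point. For reference, a dict-based equivalent of the deleted mapping, shown here only as a sketch of what was dropped, not code that remains in the module:

# equivalent of the deleted helper, kept here only for reference
_QUANT_NAMES = {"qfloat8": "float8", "qint4": "int4", "qint8": "int8", "qint2": "int2"}

def into_quantization_name(quantization_dtype: str) -> str:
    try:
        return _QUANT_NAMES[quantization_dtype]
    except KeyError:
        raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")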
quantize_swap_and_dispatch.py
DELETED
@@ -1,274 +0,0 @@
from fnmatch import fnmatch
from typing import List, Optional, Union

import torch
from click import secho
from cublas_ops import CublasLinear

from quanto import (
    QModuleMixin,
    quantize_module,
    QLinear,
    QConv2d,
    QLayerNorm,
)
from quanto.tensor import Optimizer, qtype, qfloat8, qint4, qint8
from torch import nn


class QuantizationDtype:
    qfloat8 = "qfloat8"
    qint2 = "qint2"
    qint4 = "qint4"
    qint8 = "qint8"


def into_qtype(qtype: QuantizationDtype) -> qtype:
    if qtype == QuantizationDtype.qfloat8:
        return qfloat8
    elif qtype == QuantizationDtype.qint4:
        return qint4
    elif qtype == QuantizationDtype.qint8:
        return qint8
    else:
        raise ValueError(f"Unknown qtype: {qtype}")


def _set_module_by_name(parent_module, name, child_module):
    module_names = name.split(".")
    if len(module_names) == 1:
        setattr(parent_module, name, child_module)
    else:
        parent_module_name = name[: name.rindex(".")]
        parent_module = parent_module.get_submodule(parent_module_name)
        setattr(parent_module, module_names[-1], child_module)


def _quantize_submodule(
    model: torch.nn.Module,
    name: str,
    module: torch.nn.Module,
    weights: Optional[Union[str, qtype]] = None,
    activations: Optional[Union[str, qtype]] = None,
    optimizer: Optional[Optimizer] = None,
):
    if isinstance(module, CublasLinear):
        return 0
    num_quant = 0
    qmodule = quantize_module(
        module, weights=weights, activations=activations, optimizer=optimizer
    )
    if qmodule is not None:
        _set_module_by_name(model, name, qmodule)
        # num_quant += 1
        qmodule.name = name
        for name, param in module.named_parameters():
            # Save device memory by clearing parameters
            setattr(module, name, None)
            del param
        num_quant += 1

    return num_quant


def _quantize(
    model: torch.nn.Module,
    weights: Optional[Union[str, qtype]] = None,
    activations: Optional[Union[str, qtype]] = None,
    optimizer: Optional[Optimizer] = None,
    include: Optional[Union[str, List[str]]] = None,
    exclude: Optional[Union[str, List[str]]] = None,
):
    """Quantize the specified model submodules

    Recursively quantize the submodules of the specified parent model.

    Only modules that have quantized counterparts will be quantized.

    If include patterns are specified, the submodule name must match one of them.

    If exclude patterns are specified, the submodule must not match one of them.

    Include or exclude patterns are Unix shell-style wildcards which are NOT regular expressions. See
    https://docs.python.org/3/library/fnmatch.html for more details.

    Note: quantization happens in-place and modifies the original model and its descendants.

    Args:
        model (`torch.nn.Module`): the model whose submodules will be quantized.
        weights (`Optional[Union[str, qtype]]`): the qtype for weights quantization.
        activations (`Optional[Union[str, qtype]]`): the qtype for activations quantization.
        include (`Optional[Union[str, List[str]]]`):
            Patterns constituting the allowlist. If provided, module names must match at
            least one pattern from the allowlist.
        exclude (`Optional[Union[str, List[str]]]`):
            Patterns constituting the denylist. If provided, module names must not match
            any patterns from the denylist.
    """
    num_quant = 0
    if include is not None:
        include = [include] if isinstance(include, str) else exclude
    if exclude is not None:
        exclude = [exclude] if isinstance(exclude, str) else exclude
    for name, m in model.named_modules():
        if include is not None and not any(
            fnmatch(name, pattern) for pattern in include
        ):
            continue
        if exclude is not None and any(fnmatch(name, pattern) for pattern in exclude):
            continue
        num_quant += _quantize_submodule(
            model,
            name,
            m,
            weights=weights,
            activations=activations,
            optimizer=optimizer,
        )
    return num_quant


def _freeze(model):
    for name, m in model.named_modules():
        if isinstance(m, QModuleMixin):
            m.freeze()


def _is_block_compilable(module: nn.Module) -> bool:
    for module in module.modules():
        if _is_quantized(module):
            return False
    if _is_quantized(module):
        return False
    return True


def _simple_swap_linears(model: nn.Module, root_name: str = ""):
    for name, module in model.named_children():
        if (
            _is_linear(module)
            and hasattr(module, "weight")
            and module.weight is not None
            and module.weight.data is not None
        ):
            weights = module.weight.data
            bias = None
            if module.bias is not None:
                bias = module.bias.data
            with torch.device(module.weight.device):
                new_cublas = CublasLinear(
                    module.in_features,
                    module.out_features,
                    bias=bias is not None,
                    device=module.weight.device,
                    dtype=module.weight.dtype,
                )
            new_cublas.weight.data = weights
            if bias is not None:
                new_cublas.bias.data = bias
            setattr(model, name, new_cublas)
            if root_name == "":
                secho(f"Replaced {name} with CublasLinear", fg="green")
            else:
                secho(f"Replaced {root_name}.{name} with CublasLinear", fg="green")
        else:
            if root_name == "":
                _simple_swap_linears(module, str(name))
            else:
                _simple_swap_linears(module, str(root_name) + "." + str(name))


def _full_quant(
    model, max_quants=24, current_quants=0, quantization_dtype: qtype = qfloat8
):
    if current_quants < max_quants:
        current_quants += _quantize(model, quantization_dtype)
        _freeze(model)
        print(f"Quantized {current_quants} modules with {quantization_dtype}")
    return current_quants


def _is_linear(module: nn.Module) -> bool:
    return not isinstance(
        module, (QLinear, QConv2d, QLayerNorm, CublasLinear)
    ) and isinstance(module, nn.Linear)


def _is_quantized(module: nn.Module) -> bool:
    return isinstance(module, (QLinear, QConv2d, QLayerNorm))


def quantize_and_dispatch_to_device(
    flow_model: nn.Module,
    flux_device: torch.device = torch.device("cuda"),
    flux_dtype: torch.dtype = torch.float16,
    num_layers_to_quantize: int = 20,
    quantization_dtype: QuantizationDtype = QuantizationDtype.qfloat8,
    compile_blocks: bool = True,
    compile_extras: bool = True,
    quantize_extras: bool = False,
    replace_linears: bool = True,
):
    quant_type = into_qtype(quantization_dtype)
    num_quanted = 0
    flow_model = flow_model.requires_grad_(False).eval().type(flux_dtype)
    for block in flow_model.single_blocks:
        block.cuda(flux_device)
        if num_quanted < num_layers_to_quantize:
            num_quanted = _full_quant(
                block,
                num_layers_to_quantize,
                num_quanted,
                quantization_dtype=quant_type,
            )

    for block in flow_model.double_blocks:
        block.cuda(flux_device)
        if num_quanted < num_layers_to_quantize:
            num_quanted = _full_quant(
                block,
                num_layers_to_quantize,
                num_quanted,
                quantization_dtype=quant_type,
            )

    to_gpu_extras = [
        "vector_in",
        "img_in",
        "txt_in",
        "time_in",
        "guidance_in",
        "final_layer",
        "pe_embedder",
    ]

    if compile_blocks:
        for i, block in enumerate(flow_model.single_blocks):
            if _is_block_compilable(block):
                block.compile()
                secho(f"Compiled block {i}", fg="green")
        for i, block in enumerate(flow_model.double_blocks):
            if _is_block_compilable(block):
                block.compile()
                secho(f"Compiled block {i}", fg="green")

    if replace_linears:
        _simple_swap_linears(flow_model)
    for extra in to_gpu_extras:
        m_extra = getattr(flow_model, extra).cuda(flux_device).type(flux_dtype)
        if compile_extras:
            if extra in ["time_in", "vector_in", "guidance_in", "final_layer"]:
                m_extra.compile()
                secho(
                    f"Compiled extra {extra} -- {m_extra.__class__.__name__}",
                    fg="green",
                )
        elif quantize_extras:
            if not isinstance(m_extra, nn.Linear):
                _full_quant(
                    m_extra,
                    current_quants=num_quanted,
                    max_quants=num_layers_to_quantize,
                    quantization_dtype=quantization_dtype,
                )
    return flow_model
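The deleted module was the older quanto/CublasLinear path; its _quantize walked model.named_modules() and filtered candidates with Unix-style fnmatch include/exclude patterns, which the new float8 path no longer needs. A standalone sketch of that filtering idiom; the module names below are illustrative:

from fnmatch import fnmatch

names = ["double_blocks.0.img_mlp.0", "single_blocks.3.linear1", "final_layer.linear"]
include = ["double_blocks.*", "single_blocks.*"]   # allowlist patterns
exclude = ["*final_layer*"]                        # denylist patterns

selected = [
    name
    for name in names
    if any(fnmatch(name, p) for p in include)
    and not any(fnmatch(name, p) for p in exclude)
]
print(selected)  # ['double_blocks.0.img_mlp.0', 'single_blocks.3.linear1']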
util.py
CHANGED
@@ -42,6 +42,7 @@ class ModelSpec(BaseModel):
    flow_dtype: str = "float16"
    ae_dtype: str = "bfloat16"
    text_enc_dtype: str = "bfloat16"
+    # unused / deprecated
    num_to_quant: Optional[int] = 20
    quantize_extras: bool = False
    compile_extras: bool = False